From d4a8ea30fe516ca0b959011c5eb6562af717e78f6e2309c558371a1c289dbcd5 Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Thu, 3 Jul 2025 20:59:50 +0200 Subject: [PATCH] Initial import Signed-off-by: Egbert Eich --- 0001-Hardcode-cblas-as-the-blas-library.patch | 26 + 0001-add-gfx1103-support-for-rocBLAS.patch | 599897 +++++++++++++++ 0001-fixup-install-of-tensile-output.patch | 25 + 0001-offload-compress-option.patch | 44 + ...-option-to-disable-roctracer-logging.patch | 82 + 0001-prepare-rocblas-cmake-for-fedora.patch | 26 + ...allow-to-build-modules-independently.patch | 80 + _constraints | 14 + rocBLAS-6.4.0.tar.gz | 3 + rocblas.rpmlintrc | 3 + rocblas.spec | 551 + 11 files changed, 600751 insertions(+) create mode 100644 0001-Hardcode-cblas-as-the-blas-library.patch create mode 100644 0001-add-gfx1103-support-for-rocBLAS.patch create mode 100644 0001-fixup-install-of-tensile-output.patch create mode 100644 0001-offload-compress-option.patch create mode 100644 0001-option-to-disable-roctracer-logging.patch create mode 100644 0001-prepare-rocblas-cmake-for-fedora.patch create mode 100644 Modify-CMakeLists.txt-files-to-allow-to-build-modules-independently.patch create mode 100644 _constraints create mode 100644 rocBLAS-6.4.0.tar.gz create mode 100644 rocblas.rpmlintrc create mode 100644 rocblas.spec diff --git a/0001-Hardcode-cblas-as-the-blas-library.patch b/0001-Hardcode-cblas-as-the-blas-library.patch new file mode 100644 index 0000000..416b342 --- /dev/null +++ b/0001-Hardcode-cblas-as-the-blas-library.patch @@ -0,0 +1,26 @@ +From e2a44fe6ad3bcf3c6df84b80d413ed09e1428e72 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 13 Jan 2024 07:48:37 -0500 +Subject: [PATCH] Hardcode cblas as the blas library + +Signed-off-by: Tom Rix +--- + clients/gtest/CMakeLists.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt +index 9f17fb28..3df9eab4 100644 +--- a/clients/gtest/CMakeLists.txt ++++ 
b/clients/gtest/CMakeLists.txt +@@ -153,7 +153,7 @@ target_include_directories( rocblas-test + if( BUILD_FORTRAN_CLIENTS ) + target_link_libraries( rocblas-test PRIVATE rocblas_fortran_client ) + endif( ) +-target_link_libraries( rocblas-test PRIVATE ${BLAS_LIBRARY} ${GTEST_BOTH_LIBRARIES} roc::rocblas ) ++target_link_libraries( rocblas-test PRIVATE cblas ${GTEST_BOTH_LIBRARIES} roc::rocblas ) + + if( CUDA_FOUND ) + target_include_directories( rocblas-test +-- +2.43.0 + diff --git a/0001-add-gfx1103-support-for-rocBLAS.patch b/0001-add-gfx1103-support-for-rocBLAS.patch new file mode 100644 index 0000000..29fb1ae --- /dev/null +++ b/0001-add-gfx1103-support-for-rocBLAS.patch @@ -0,0 +1,599897 @@ +From a1270dbf573619f165ce9dabef08d1640e2a39dd Mon Sep 17 00:00:00 2001 +From: Yifan Zhang +Date: Wed, 24 Apr 2024 22:27:36 +0800 +Subject: [PATCH] add gfx1103 support for rocBLAS + +Signed-off-by: Yifan Zhang +--- + CMakeLists.txt | 2 +- + .../phoenix_Cijk_Ailk_Bjlk_BBS_BH.yaml | 16503 +++++++++ + .../phoenix_Cijk_Ailk_Bjlk_BBS_BH_GB.yaml | 16503 +++++++++ + .../phoenix/phoenix_Cijk_Ailk_Bjlk_HB.yaml | 17853 ++++++++++ + .../phoenix/phoenix_Cijk_Ailk_Bjlk_HB_GB.yaml | 17853 ++++++++++ + .../phoenix_Cijk_Ailk_Bjlk_HHS_BH.yaml | 8943 +++++ + .../phoenix_Cijk_Ailk_Bjlk_HHS_BH_GB.yaml | 8943 +++++ + .../phoenix_Cijk_Ailk_Bjlk_I8II_BH.yaml | 22983 +++++++++++++ + .../phoenix_Cijk_Ailk_Bjlk_I8II_BH_GB.yaml | 22983 +++++++++++++ + .../phoenix/phoenix_Cijk_Ailk_Bjlk_SB.yaml | 310 + + .../phoenix_Cijk_Ailk_Bljk_BBS_BH.yaml | 14343 ++++++++ + .../phoenix_Cijk_Ailk_Bljk_BBS_BH_GB.yaml | 14343 ++++++++ + .../phoenix/phoenix_Cijk_Ailk_Bljk_HB.yaml | 22713 +++++++++++++ + .../phoenix/phoenix_Cijk_Ailk_Bljk_HB_GB.yaml | 22713 +++++++++++++ + .../phoenix_Cijk_Ailk_Bljk_HHS_BH.yaml | 27843 +++++++++++++++ + .../phoenix_Cijk_Ailk_Bljk_HHS_BH_GB.yaml | 27843 +++++++++++++++ + .../phoenix_Cijk_Ailk_Bljk_I8II_BH.yaml | 28113 ++++++++++++++++ + .../phoenix_Cijk_Ailk_Bljk_I8II_BH_GB.yaml | 28113 
++++++++++++++++ + .../phoenix/phoenix_Cijk_Ailk_Bljk_SB.yaml | 310 + + .../phoenix_Cijk_Alik_Bjlk_BBS_BH.yaml | 21903 ++++++++++++ + .../phoenix_Cijk_Alik_Bjlk_BBS_BH_GB.yaml | 21903 ++++++++++++ + .../phoenix/phoenix_Cijk_Alik_Bjlk_HB.yaml | 16503 +++++++++ + .../phoenix/phoenix_Cijk_Alik_Bjlk_HB_GB.yaml | 16503 +++++++++ + .../phoenix_Cijk_Alik_Bjlk_HHS_BH.yaml | 17313 ++++++++++ + .../phoenix_Cijk_Alik_Bjlk_HHS_BH_GB.yaml | 17313 ++++++++++ + .../phoenix_Cijk_Alik_Bjlk_I8II_BH.yaml | 26493 +++++++++++++++ + .../phoenix_Cijk_Alik_Bjlk_I8II_BH_GB.yaml | 26493 +++++++++++++++ + .../phoenix/phoenix_Cijk_Alik_Bjlk_SB.yaml | 310 + + .../phoenix_Cijk_Alik_Bljk_BBS_BH.yaml | 9213 +++++ + .../phoenix_Cijk_Alik_Bljk_BBS_BH_GB.yaml | 9213 +++++ + .../phoenix/phoenix_Cijk_Alik_Bljk_HB.yaml | 10833 ++++++ + .../phoenix/phoenix_Cijk_Alik_Bljk_HB_GB.yaml | 10833 ++++++ + .../phoenix_Cijk_Alik_Bljk_HHS_BH.yaml | 15423 +++++++++ + .../phoenix_Cijk_Alik_Bljk_HHS_BH_GB.yaml | 15423 +++++++++ + .../phoenix_Cijk_Alik_Bljk_I8II_BH.yaml | 22173 ++++++++++++ + .../phoenix_Cijk_Alik_Bljk_I8II_BH_GB.yaml | 22173 ++++++++++++ + .../phoenix/phoenix_Cijk_Alik_Bljk_SB.yaml | 310 + + library/src/handle.cpp | 4 + + library/src/include/handle.hpp | 3 +- + library/src/tensile_host.cpp | 4 + + 40 files changed, 599547 insertions(+), 2 deletions(-) + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_BBS_BH.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_BBS_BH_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HB_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HHS_BH.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HHS_BH_GB.yaml + create mode 100644 
library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_I8II_BH.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_I8II_BH_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_SB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_BBS_BH.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_BBS_BH_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HB_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HHS_BH.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HHS_BH_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_I8II_BH.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_I8II_BH_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_SB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_BBS_BH.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_BBS_BH_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HB_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HHS_BH.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HHS_BH_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_I8II_BH.yaml + create mode 100644 
library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_I8II_BH_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_SB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_BBS_BH.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_BBS_BH_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HB_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HHS_BH.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HHS_BH_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_I8II_BH.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_I8II_BH_GB.yaml + create mode 100644 library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_SB.yaml + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index f496737e..bb853624 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -110,7 +110,7 @@ list( APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/llvm ${ROCM_PATH} ${ROCM_PATH}/hip / + # setting target list based on ROCm version + set( TARGET_LIST_ROCM_5.6 "gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102") + set( TARGET_LIST_ROCM_5.7 "gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102") +-set( TARGET_LIST_ROCM_6.0 "gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102") ++set( TARGET_LIST_ROCM_6.0 
"gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1103") + + if(ROCM_PLATFORM_VERSION) + if(${ROCM_PLATFORM_VERSION} VERSION_LESS 5.7.0) +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_BBS_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_BBS_BH.yaml +new file mode 100644 +index 00000000..5c0bec10 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_BBS_BH.yaml +@@ -0,0 +1,16503 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ 
AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ 
LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ 
Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 
4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ 
OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ 
DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW4_PLR1_SIA2_SVW1_VW1_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 
4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ 
OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW8_PLR1_SIA2_SVW1_VW1_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ 
DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 
4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ 
OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ 
DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW4_PLR1_SIA2_SVW1_VW1_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW8_PLR1_SIA2_SVW1_VW1_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 
4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ 
OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ 
DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 
4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ 
OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ 
DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW4_PLR1_SIA2_SVW1_VW1_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 
4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ 
OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW8_PLR1_SIA2_SVW1_VW1_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ 
DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 
4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ 
OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS0_GRVW8_PLR1_SIA3_SVW1_VW1_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ 
DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 
4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ 
OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ 
DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 
4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ 
OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ 
DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ 
DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS1_GRVW8_PLR0_SIA1_SVW1_VW1_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ 
DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 
1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS1_GRVW8_PLR0_SIA1_SVW1_VW1_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ 
DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW4_PLR0_SIA1_SVW1_VW1_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 
1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ 
DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS1_GRVW4_PLR0_SIA1_SVW1_VW1_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ 
DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [2, 30.7249] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [26, 51.6133] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [0, 87.5345] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [39, 141.834] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [39, 202.673] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [39, 260.064] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [42, 292.868] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [2, 54.3585] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [27, 99.4288] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [39, 172.975] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [39, 277.622] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [51, 400.909] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [39, 519.435] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [41, 584.949] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [26, 126.107] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [52, 236.646] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [39, 407.61] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [39, 633.485] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [39, 
874.407] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [45, 1087.56] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [48, 1204.61] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [9, 272.145] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [37, 495.722] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [45, 844.776] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [39, 1307.76] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [53, 1801.19] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [42, 2232.65] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [41, 2441.21] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [39, 567.718] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [43, 1078.37] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [52, 1809.06] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [40, 2774.47] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [39, 3752.04] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [43, 4494.46] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [48, 4962.21] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [43, 1198.37] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [39, 2173.5] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [37, 3615.39] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [52, 5323.99] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [52, 7465.67] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [44, 9106.61] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [40, 9173.37] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [5, 2062.6] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [22, 3754.97] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [24, 6284.81] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [7, 9496.76] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [5, 13129.0] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [23, 15629.4] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [24, 16788.0] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [2, 62.2892] ++ - - 
[128, 64, 1, 128, 160, 160, 160, 160] ++ - [57, 99.7888] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [45, 176.202] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [39, 278.008] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [45, 402.467] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [51, 519.017] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [41, 582.644] ++ - - [128, 128, 1, 64, 160, 160, 128, 128] ++ - [28, 135.51] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [34, 265.16] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [52, 439.794] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [15, 650.331] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [52, 905.116] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [45, 1111.77] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [39, 1214.05] ++ - - [128, 256, 1, 64, 160, 160, 128, 256] ++ - [28, 333.305] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [45, 589.502] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [45, 979.297] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [45, 1456.86] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [39, 1914.34] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [52, 2307.1] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [55, 2480.05] ++ - - [128, 512, 1, 64, 160, 160, 128, 512] ++ - [25, 678.032] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [45, 1202.84] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [45, 1992.32] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [43, 3033.85] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [39, 4018.98] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [46, 4681.47] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [42, 5033.67] ++ - - [128, 1024, 1, 64, 160, 160, 128, 1024] ++ - [45, 1386.56] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [52, 2617.35] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [52, 4254.4] ++ - - [128, 1024, 1, 512, 
160, 160, 544, 1024] ++ - [45, 5978.52] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [42, 8119.64] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [45, 9423.09] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [42, 10243.1] ++ - - [128, 2048, 1, 64, 160, 160, 128, 2048] ++ - [6, 2722.25] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [45, 4679.18] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [48, 7688.04] ++ - - [128, 2048, 1, 512, 160, 160, 544, 2048] ++ - [42, 11409.2] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [39, 15393.7] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [46, 18850.2] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [1, 16319.5] ++ - - [128, 4096, 1, 64, 160, 160, 128, 4096] ++ - [24, 4496.71] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [15, 8042.79] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [8, 13359.0] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [23, 20091.0] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [24, 27069.6] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [8, 32463.8] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [24, 32971.3] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [2, 139.346] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [39, 229.147] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [39, 394.053] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [52, 630.344] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [38, 875.319] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [39, 1085.69] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [42, 1199.89] ++ - - [256, 128, 1, 64, 288, 288, 256, 128] ++ - [33, 314.274] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [28, 592.249] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [52, 978.492] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [39, 1416.64] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [52, 1946.54] ++ - - [256, 128, 1, 2048, 
288, 288, 2080, 2080] ++ - [52, 2302.19] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [55, 2497.22] ++ - - [256, 256, 1, 64, 288, 288, 256, 256] ++ - [9, 679.022] ++ - - [256, 256, 1, 128, 288, 288, 256, 256] ++ - [39, 1206.13] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [45, 1972.63] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [50, 2934.11] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [48, 4008.9] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [48, 4689.49] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [42, 5033.58] ++ - - [256, 512, 1, 64, 288, 288, 256, 512] ++ - [52, 1464.24] ++ - - [256, 512, 1, 128, 288, 288, 256, 512] ++ - [45, 2606.37] ++ - - [256, 512, 1, 256, 288, 288, 288, 512] ++ - [48, 4221.21] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [55, 5979.05] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [48, 7924.53] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [39, 9532.18] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [42, 10030.3] ++ - - [256, 1024, 1, 64, 288, 288, 256, 1024] ++ - [13, 2770.35] ++ - - [256, 1024, 1, 128, 288, 288, 256, 1024] ++ - [45, 4902.04] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [13, 7787.98] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [46, 11479.4] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [42, 15410.5] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [55, 18775.0] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [55, 20118.8] ++ - - [256, 2048, 1, 64, 288, 288, 256, 2048] ++ - [33, 4644.87] ++ - - [256, 2048, 1, 128, 288, 288, 256, 2048] ++ - [6, 8300.44] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [24, 13617.9] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [15, 20187.7] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [15, 26389.7] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [7, 31808.0] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [8, 34198.9] ++ - - [256, 4096, 1, 
64, 288, 288, 256, 4096] ++ - [26, 8185.01] ++ - - [256, 4096, 1, 128, 288, 288, 256, 4096] ++ - [8, 14367.1] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [7, 21442.2] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [13, 27799.9] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [6, 33428.1] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [6, 36071.0] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [13, 38653.0] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [2, 215.196] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [45, 358.733] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [45, 610.88] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [49, 945.16] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [39, 1351.04] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [41, 1636.59] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [48, 1809.91] ++ - - [384, 128, 1, 64, 416, 416, 384, 128] ++ - [18, 486.881] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [52, 876.858] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [39, 1448.81] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [52, 2157.38] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [52, 2900.63] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [42, 3499.39] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [55, 3726.55] ++ - - [384, 256, 1, 64, 416, 416, 384, 256] ++ - [56, 1086.42] ++ - - [384, 256, 1, 128, 416, 416, 384, 256] ++ - [39, 1929.01] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [48, 3010.28] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [39, 4460.44] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [39, 5901.59] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [55, 7120.05] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [48, 7472.88] ++ - - [384, 512, 1, 64, 416, 416, 384, 512] ++ - [55, 2105.93] ++ - - [384, 512, 1, 128, 416, 416, 384, 512] ++ - [37, 3720.55] ++ - - [384, 512, 1, 256, 416, 416, 384, 
512] ++ - [39, 6111.18] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [55, 8732.11] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [48, 11650.8] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [54, 13892.7] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [46, 14912.5] ++ - - [384, 1024, 1, 64, 416, 416, 384, 1024] ++ - [15, 3460.19] ++ - - [384, 1024, 1, 128, 416, 416, 384, 1024] ++ - [29, 6184.79] ++ - - [384, 1024, 1, 256, 416, 416, 384, 1024] ++ - [24, 10509.8] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [24, 15533.3] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [15, 20049.5] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [22, 23903.4] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [15, 25873.3] ++ - - [384, 2048, 1, 64, 416, 416, 384, 2048] ++ - [33, 5974.08] ++ - - [384, 2048, 1, 128, 416, 416, 384, 2048] ++ - [24, 11479.4] ++ - - [384, 2048, 1, 256, 416, 416, 384, 2048] ++ - [15, 17897.3] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [8, 24819.9] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [6, 30470.6] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [23, 33605.6] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [6, 36697.9] ++ - - [384, 4096, 1, 64, 416, 416, 384, 4096] ++ - [33, 10317.0] ++ - - [384, 4096, 1, 128, 416, 416, 384, 4096] ++ - [19, 16366.7] ++ - - [384, 4096, 1, 256, 416, 416, 384, 4096] ++ - [52, 23518.1] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] ++ - [55, 29611.2] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [55, 32562.0] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [48, 35377.9] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [42, 36190.9] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [18, 469.512] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [37, 787.022] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [24, 1303.66] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [39, 1976.27] ++ - - [768, 64, 1, 1024, 800, 
800, 1056, 1056] ++ - [48, 2740.63] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [48, 3280.97] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [47, 3603.29] ++ - - [768, 128, 1, 64, 800, 800, 768, 128] ++ - [9, 970.454] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [43, 1825.46] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [52, 2986.33] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [45, 4419.72] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [48, 5939.9] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [48, 6919.87] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [48, 7420.68] ++ - - [768, 256, 1, 64, 800, 800, 768, 256] ++ - [29, 2114.06] ++ - - [768, 256, 1, 128, 800, 800, 768, 256] ++ - [8, 3705.22] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [45, 5881.93] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [45, 8712.44] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [39, 11451.4] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [42, 13812.2] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [55, 14830.2] ++ - - [768, 512, 1, 64, 800, 800, 768, 512] ++ - [15, 3621.5] ++ - - [768, 512, 1, 128, 800, 800, 768, 512] ++ - [13, 6408.43] ++ - - [768, 512, 1, 256, 800, 800, 768, 512] ++ - [8, 10534.0] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [42, 15170.4] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [24, 20003.6] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [24, 24272.4] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [8, 25795.0] ++ - - [768, 1024, 1, 64, 800, 800, 768, 1024] ++ - [29, 6152.28] ++ - - [768, 1024, 1, 128, 800, 800, 768, 1024] ++ - [16, 10627.5] ++ - - [768, 1024, 1, 256, 800, 800, 768, 1024] ++ - [24, 17379.8] ++ - - [768, 1024, 1, 512, 800, 800, 768, 1024] ++ - [22, 24322.1] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [22, 30392.4] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [13, 33606.3] ++ - - [768, 1024, 1, 4096, 800, 800, 
4128, 4128] ++ - [22, 36829.2] ++ - - [768, 2048, 1, 64, 800, 800, 768, 2048] ++ - [36, 10766.1] ++ - - [768, 2048, 1, 128, 800, 800, 768, 2048] ++ - [19, 17115.3] ++ - - [768, 2048, 1, 256, 800, 800, 768, 2048] ++ - [10, 24102.3] ++ - - [768, 2048, 1, 512, 800, 800, 768, 2048] ++ - [55, 29612.3] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [55, 33057.9] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [55, 36158.6] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [24, 38116.5] ++ - - [768, 4096, 1, 64, 800, 800, 768, 4096] ++ - [10, 13569.2] ++ - - [768, 4096, 1, 128, 800, 800, 768, 4096] ++ - [9, 20596.1] ++ - - [768, 4096, 1, 256, 800, 800, 768, 4096] ++ - [20, 28041.9] ++ - - [768, 4096, 1, 512, 800, 800, 768, 4096] ++ - [55, 31764.4] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [55, 35959.2] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [5, 38610.0] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [12, 39510.0] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [17, 946.233] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [45, 1601.29] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [24, 2632.96] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [52, 3934.62] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [48, 5351.02] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [48, 6619.77] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [48, 7303.71] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] ++ - [15, 1888.76] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [52, 3539.99] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [39, 6041.49] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [52, 8806.94] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [52, 11234.8] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [48, 13761.7] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [46, 14106.4] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] ++ - 
[29, 3585.9] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] ++ - [15, 6423.15] ++ - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] ++ - [8, 10526.4] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [24, 15462.9] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [24, 20284.8] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [22, 24015.4] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [15, 26034.8] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] ++ - [31, 6139.5] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] ++ - [24, 11486.0] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] ++ - [24, 17816.5] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [24, 24818.4] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] ++ - [15, 30258.8] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [13, 33568.5] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [6, 36809.0] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] ++ - [33, 10476.0] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] ++ - [19, 17247.2] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] ++ - [24, 24158.7] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] ++ - [48, 29792.0] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [55, 32800.7] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [24, 36051.4] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [24, 37710.7] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] ++ - [29, 13608.7] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] ++ - [35, 20714.8] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] ++ - [4, 27674.7] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] ++ - [24, 31745.0] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] ++ - [55, 36001.8] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [23, 38558.2] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [12, 39599.1] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] ++ - [29, 16167.6] ++ - - [1536, 
4096, 1, 128, 1568, 1568, 1536, 4096] ++ - [55, 23589.1] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] ++ - [3, 29528.2] ++ - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] ++ - [55, 34534.8] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] ++ - [55, 37987.8] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [55, 39263.6] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [21, 39568.3] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [12, 1677.05] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [6, 2923.55] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [23, 4878.52] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [14, 7309.82] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [7, 9832.81] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [13, 11891.0] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [5, 12772.5] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] ++ - [24, 3723.32] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [22, 6671.77] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [24, 10899.0] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [15, 15927.7] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [8, 20958.4] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [8, 24666.3] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [24, 26183.3] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] ++ - [32, 7056.17] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] ++ - [15, 11948.2] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [15, 18416.3] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [24, 25338.5] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [15, 31568.3] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [22, 33824.4] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [6, 37041.4] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] ++ - [34, 10352.1] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] ++ - [9, 16704.9] 
++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] ++ - [15, 23610.5] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [20, 30053.3] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [6, 32774.0] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [15, 36148.1] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [24, 37999.9] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] ++ - [19, 13599.5] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] ++ - [18, 20720.1] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] ++ - [19, 27637.7] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] ++ - [55, 32050.1] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [55, 36124.6] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - [23, 38669.3] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [12, 39553.2] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] ++ - [35, 16164.3] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] ++ - [11, 23552.5] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] ++ - [19, 29435.9] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] ++ - [55, 34564.0] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] ++ - [55, 37989.5] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [55, 39266.9] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [12, 39794.0] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] ++ - [30, 17077.5] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] ++ - [20, 24680.7] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] ++ - [20, 31525.4] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] ++ - [55, 36130.5] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] ++ - [55, 38439.8] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] ++ - [55, 38969.9] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [24, 39162.7] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [7, 1825.59] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [12, 3877.78] ++ - - [4096, 64, 
1, 256, 4128, 4128, 4096, 288] ++ - [7, 6455.27] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [12, 9703.46] ++ - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] ++ - [5, 13158.6] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [7, 15595.8] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [5, 16933.6] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] ++ - [26, 5161.43] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [32, 9529.84] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [8, 15379.7] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [8, 22033.7] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [15, 27741.0] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [24, 32030.1] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [8, 33942.7] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] ++ - [29, 8763.26] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] ++ - [15, 14527.3] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [15, 21554.2] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [24, 27741.0] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [13, 33455.2] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [13, 36019.6] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [13, 38550.0] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] ++ - [30, 11749.8] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] ++ - [30, 18454.3] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] ++ - [20, 25520.3] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [24, 31776.9] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [24, 34586.7] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [6, 37775.6] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [13, 39494.7] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] ++ - [33, 14779.3] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] ++ - [19, 22062.7] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] ++ - [19, 28909.1] ++ 
- - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] ++ - [55, 33080.5] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [55, 36944.1] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [55, 39054.4] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [22, 39165.9] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] ++ - [33, 16976.7] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] ++ - [20, 23385.0] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] ++ - [11, 30381.9] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] ++ - [55, 35334.5] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] ++ - [55, 38517.3] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [48, 39106.9] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [21, 40079.5] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] ++ - [33, 17567.5] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] ++ - [20, 25017.0] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] ++ - [19, 31303.8] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] ++ - [55, 36382.3] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] ++ - [55, 38332.7] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] ++ - [21, 39483.2] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [24, 39187.2] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_BBS_BH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_BBS_BH_GB.yaml +new file mode 100644 +index 00000000..e109818c +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_BBS_BH_GB.yaml +@@ -0,0 +1,16503 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: 
false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 
++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 
++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] 
++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ 
NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW4_PLR1_SIA2_SVW1_VW1_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 
++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] 
++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW8_PLR1_SIA2_SVW1_VW1_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ 
NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ 
GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ 
NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW4_PLR1_SIA2_SVW1_VW1_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW8_PLR1_SIA2_SVW1_VW1_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ 
NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ 
NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW4_PLR1_SIA2_SVW1_VW1_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW8_PLR1_SIA2_SVW1_VW1_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ 
NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS0_GRVW8_PLR1_SIA3_SVW1_VW1_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ 
NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ 
NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ 
NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ 
LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ 
GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS1_GRVW8_PLR0_SIA1_SVW1_VW1_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ 
LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false 
++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ 
LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS1_GRVW8_PLR0_SIA1_SVW1_VW1_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW4_PLR0_SIA1_SVW1_VW1_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ 
LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS1_GRVW4_PLR0_SIA1_SVW1_VW1_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ 
LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [2, 30.7249] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [26, 
51.6133] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [0, 87.5345] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [39, 141.834] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [39, 202.673] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [39, 260.064] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [42, 292.868] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [2, 54.3585] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [27, 99.4288] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [39, 172.975] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [39, 277.622] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [51, 400.909] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [39, 519.435] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [41, 584.949] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [26, 126.107] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [52, 236.646] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [39, 407.61] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [39, 633.485] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [39, 874.407] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [45, 1087.56] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [48, 1204.61] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [9, 272.145] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [37, 495.722] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [45, 844.776] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [39, 1307.76] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [53, 1801.19] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [42, 2232.65] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [41, 2441.21] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [39, 567.718] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [43, 1078.37] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [52, 1809.06] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [40, 2774.47] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [39, 3752.04] ++ - - [64, 1024, 1, 2048, 96, 96, 
2080, 2080] ++ - [43, 4494.46] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [48, 4962.21] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [43, 1198.37] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [39, 2173.5] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [37, 3615.39] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [52, 5323.99] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [52, 7465.67] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [44, 9106.61] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [40, 9173.37] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [5, 2062.6] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [22, 3754.97] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [24, 6284.81] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [7, 9496.76] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [5, 13129.0] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [23, 15629.4] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [24, 16788.0] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [2, 62.2892] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [57, 99.7888] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [45, 176.202] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [39, 278.008] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [45, 402.467] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [51, 519.017] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [41, 582.644] ++ - - [128, 128, 1, 64, 160, 160, 128, 128] ++ - [28, 135.51] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [34, 265.16] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [52, 439.794] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [15, 650.331] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [52, 905.116] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [45, 1111.77] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [39, 1214.05] ++ - - [128, 256, 1, 64, 160, 160, 128, 256] ++ - [28, 333.305] ++ - - [128, 256, 1, 
128, 160, 160, 160, 256] ++ - [45, 589.502] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [45, 979.297] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [45, 1456.86] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [39, 1914.34] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [52, 2307.1] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [55, 2480.05] ++ - - [128, 512, 1, 64, 160, 160, 128, 512] ++ - [25, 678.032] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [45, 1202.84] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [45, 1992.32] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [43, 3033.85] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [39, 4018.98] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [46, 4681.47] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [42, 5033.67] ++ - - [128, 1024, 1, 64, 160, 160, 128, 1024] ++ - [45, 1386.56] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [52, 2617.35] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [52, 4254.4] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [45, 5978.52] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [42, 8119.64] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [45, 9423.09] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [42, 10243.1] ++ - - [128, 2048, 1, 64, 160, 160, 128, 2048] ++ - [6, 2722.25] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [45, 4679.18] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [48, 7688.04] ++ - - [128, 2048, 1, 512, 160, 160, 544, 2048] ++ - [42, 11409.2] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [39, 15393.7] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [46, 18850.2] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [1, 16319.5] ++ - - [128, 4096, 1, 64, 160, 160, 128, 4096] ++ - [24, 4496.71] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [15, 8042.79] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [8, 13359.0] ++ - - [128, 
4096, 1, 512, 160, 160, 544, 4096] ++ - [23, 20091.0] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [24, 27069.6] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [8, 32463.8] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [24, 32971.3] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [2, 139.346] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [39, 229.147] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [39, 394.053] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [52, 630.344] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [38, 875.319] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [39, 1085.69] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [42, 1199.89] ++ - - [256, 128, 1, 64, 288, 288, 256, 128] ++ - [33, 314.274] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [28, 592.249] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [52, 978.492] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [39, 1416.64] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [52, 1946.54] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [52, 2302.19] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [55, 2497.22] ++ - - [256, 256, 1, 64, 288, 288, 256, 256] ++ - [9, 679.022] ++ - - [256, 256, 1, 128, 288, 288, 256, 256] ++ - [39, 1206.13] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [45, 1972.63] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [50, 2934.11] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [48, 4008.9] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [48, 4689.49] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [42, 5033.58] ++ - - [256, 512, 1, 64, 288, 288, 256, 512] ++ - [52, 1464.24] ++ - - [256, 512, 1, 128, 288, 288, 256, 512] ++ - [45, 2606.37] ++ - - [256, 512, 1, 256, 288, 288, 288, 512] ++ - [48, 4221.21] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [55, 5979.05] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [48, 7924.53] ++ - - [256, 512, 1, 2048, 288, 
288, 2080, 2080] ++ - [39, 9532.18] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [42, 10030.3] ++ - - [256, 1024, 1, 64, 288, 288, 256, 1024] ++ - [13, 2770.35] ++ - - [256, 1024, 1, 128, 288, 288, 256, 1024] ++ - [45, 4902.04] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [13, 7787.98] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [46, 11479.4] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [42, 15410.5] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [55, 18775.0] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [55, 20118.8] ++ - - [256, 2048, 1, 64, 288, 288, 256, 2048] ++ - [33, 4644.87] ++ - - [256, 2048, 1, 128, 288, 288, 256, 2048] ++ - [6, 8300.44] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [24, 13617.9] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [15, 20187.7] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [15, 26389.7] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [7, 31808.0] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [8, 34198.9] ++ - - [256, 4096, 1, 64, 288, 288, 256, 4096] ++ - [26, 8185.01] ++ - - [256, 4096, 1, 128, 288, 288, 256, 4096] ++ - [8, 14367.1] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [7, 21442.2] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [13, 27799.9] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [6, 33428.1] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [6, 36071.0] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [13, 38653.0] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [2, 215.196] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [45, 358.733] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [45, 610.88] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [49, 945.16] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [39, 1351.04] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [41, 1636.59] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [48, 1809.91] ++ - - [384, 128, 1, 64, 
416, 416, 384, 128] ++ - [18, 486.881] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [52, 876.858] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [39, 1448.81] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [52, 2157.38] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [52, 2900.63] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [42, 3499.39] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [55, 3726.55] ++ - - [384, 256, 1, 64, 416, 416, 384, 256] ++ - [56, 1086.42] ++ - - [384, 256, 1, 128, 416, 416, 384, 256] ++ - [39, 1929.01] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [48, 3010.28] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [39, 4460.44] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [39, 5901.59] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [55, 7120.05] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [48, 7472.88] ++ - - [384, 512, 1, 64, 416, 416, 384, 512] ++ - [55, 2105.93] ++ - - [384, 512, 1, 128, 416, 416, 384, 512] ++ - [37, 3720.55] ++ - - [384, 512, 1, 256, 416, 416, 384, 512] ++ - [39, 6111.18] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [55, 8732.11] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [48, 11650.8] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [54, 13892.7] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [46, 14912.5] ++ - - [384, 1024, 1, 64, 416, 416, 384, 1024] ++ - [15, 3460.19] ++ - - [384, 1024, 1, 128, 416, 416, 384, 1024] ++ - [29, 6184.79] ++ - - [384, 1024, 1, 256, 416, 416, 384, 1024] ++ - [24, 10509.8] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [24, 15533.3] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [15, 20049.5] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [22, 23903.4] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [15, 25873.3] ++ - - [384, 2048, 1, 64, 416, 416, 384, 2048] ++ - [33, 5974.08] ++ - - [384, 2048, 1, 128, 416, 416, 384, 2048] ++ - [24, 11479.4] ++ - - [384, 2048, 1, 256, 
416, 416, 384, 2048] ++ - [15, 17897.3] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [8, 24819.9] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [6, 30470.6] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [23, 33605.6] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [6, 36697.9] ++ - - [384, 4096, 1, 64, 416, 416, 384, 4096] ++ - [33, 10317.0] ++ - - [384, 4096, 1, 128, 416, 416, 384, 4096] ++ - [19, 16366.7] ++ - - [384, 4096, 1, 256, 416, 416, 384, 4096] ++ - [52, 23518.1] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] ++ - [55, 29611.2] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [55, 32562.0] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [48, 35377.9] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [42, 36190.9] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [18, 469.512] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [37, 787.022] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [24, 1303.66] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [39, 1976.27] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [48, 2740.63] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [48, 3280.97] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [47, 3603.29] ++ - - [768, 128, 1, 64, 800, 800, 768, 128] ++ - [9, 970.454] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [43, 1825.46] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [52, 2986.33] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [45, 4419.72] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [48, 5939.9] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [48, 6919.87] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [48, 7420.68] ++ - - [768, 256, 1, 64, 800, 800, 768, 256] ++ - [29, 2114.06] ++ - - [768, 256, 1, 128, 800, 800, 768, 256] ++ - [8, 3705.22] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [45, 5881.93] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [45, 8712.44] ++ - - [768, 256, 1, 1024, 800, 800, 
1056, 1056] ++ - [39, 11451.4] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [42, 13812.2] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [55, 14830.2] ++ - - [768, 512, 1, 64, 800, 800, 768, 512] ++ - [15, 3621.5] ++ - - [768, 512, 1, 128, 800, 800, 768, 512] ++ - [13, 6408.43] ++ - - [768, 512, 1, 256, 800, 800, 768, 512] ++ - [8, 10534.0] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [42, 15170.4] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [24, 20003.6] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [24, 24272.4] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [8, 25795.0] ++ - - [768, 1024, 1, 64, 800, 800, 768, 1024] ++ - [29, 6152.28] ++ - - [768, 1024, 1, 128, 800, 800, 768, 1024] ++ - [16, 10627.5] ++ - - [768, 1024, 1, 256, 800, 800, 768, 1024] ++ - [24, 17379.8] ++ - - [768, 1024, 1, 512, 800, 800, 768, 1024] ++ - [22, 24322.1] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [22, 30392.4] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [13, 33606.3] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [22, 36829.2] ++ - - [768, 2048, 1, 64, 800, 800, 768, 2048] ++ - [36, 10766.1] ++ - - [768, 2048, 1, 128, 800, 800, 768, 2048] ++ - [19, 17115.3] ++ - - [768, 2048, 1, 256, 800, 800, 768, 2048] ++ - [10, 24102.3] ++ - - [768, 2048, 1, 512, 800, 800, 768, 2048] ++ - [55, 29612.3] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [55, 33057.9] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [55, 36158.6] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [24, 38116.5] ++ - - [768, 4096, 1, 64, 800, 800, 768, 4096] ++ - [10, 13569.2] ++ - - [768, 4096, 1, 128, 800, 800, 768, 4096] ++ - [9, 20596.1] ++ - - [768, 4096, 1, 256, 800, 800, 768, 4096] ++ - [20, 28041.9] ++ - - [768, 4096, 1, 512, 800, 800, 768, 4096] ++ - [55, 31764.4] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [55, 35959.2] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [5, 38610.0] ++ - - [768, 4096, 1, 
4096, 800, 800, 4128, 4128] ++ - [12, 39510.0] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [17, 946.233] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [45, 1601.29] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [24, 2632.96] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [52, 3934.62] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [48, 5351.02] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [48, 6619.77] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [48, 7303.71] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] ++ - [15, 1888.76] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [52, 3539.99] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [39, 6041.49] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [52, 8806.94] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [52, 11234.8] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [48, 13761.7] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [46, 14106.4] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] ++ - [29, 3585.9] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] ++ - [15, 6423.15] ++ - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] ++ - [8, 10526.4] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [24, 15462.9] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [24, 20284.8] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [22, 24015.4] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [15, 26034.8] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] ++ - [31, 6139.5] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] ++ - [24, 11486.0] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] ++ - [24, 17816.5] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [24, 24818.4] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] ++ - [15, 30258.8] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [13, 33568.5] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [6, 36809.0] ++ - - 
[1536, 1024, 1, 64, 1568, 1568, 1536, 1024] ++ - [33, 10476.0] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] ++ - [19, 17247.2] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] ++ - [24, 24158.7] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] ++ - [48, 29792.0] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [55, 32800.7] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [24, 36051.4] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [24, 37710.7] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] ++ - [29, 13608.7] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] ++ - [35, 20714.8] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] ++ - [4, 27674.7] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] ++ - [24, 31745.0] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] ++ - [55, 36001.8] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [23, 38558.2] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [12, 39599.1] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] ++ - [29, 16167.6] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] ++ - [55, 23589.1] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] ++ - [3, 29528.2] ++ - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] ++ - [55, 34534.8] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] ++ - [55, 37987.8] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [55, 39263.6] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [21, 39568.3] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [12, 1677.05] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [6, 2923.55] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [23, 4878.52] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [14, 7309.82] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [7, 9832.81] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [13, 11891.0] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [5, 12772.5] ++ - - [3072, 128, 1, 64, 3104, 
3104, 3072, 128] ++ - [24, 3723.32] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [22, 6671.77] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [24, 10899.0] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [15, 15927.7] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [8, 20958.4] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [8, 24666.3] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [24, 26183.3] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] ++ - [32, 7056.17] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] ++ - [15, 11948.2] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [15, 18416.3] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [24, 25338.5] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [15, 31568.3] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [22, 33824.4] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [6, 37041.4] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] ++ - [34, 10352.1] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] ++ - [9, 16704.9] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] ++ - [15, 23610.5] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [20, 30053.3] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [6, 32774.0] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [15, 36148.1] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [24, 37999.9] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] ++ - [19, 13599.5] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] ++ - [18, 20720.1] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] ++ - [19, 27637.7] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] ++ - [55, 32050.1] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [55, 36124.6] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - [23, 38669.3] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [12, 39553.2] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] ++ - [35, 16164.3] ++ 
- - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] ++ - [11, 23552.5] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] ++ - [19, 29435.9] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] ++ - [55, 34564.0] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] ++ - [55, 37989.5] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [55, 39266.9] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [12, 39794.0] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] ++ - [30, 17077.5] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] ++ - [20, 24680.7] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] ++ - [20, 31525.4] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] ++ - [55, 36130.5] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] ++ - [55, 38439.8] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] ++ - [55, 38969.9] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [24, 39162.7] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [7, 1825.59] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [12, 3877.78] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [7, 6455.27] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [12, 9703.46] ++ - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] ++ - [5, 13158.6] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [7, 15595.8] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [5, 16933.6] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] ++ - [26, 5161.43] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [32, 9529.84] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [8, 15379.7] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [8, 22033.7] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [15, 27741.0] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [24, 32030.1] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [8, 33942.7] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] ++ - [29, 8763.26] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 
256] ++ - [15, 14527.3] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [15, 21554.2] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [24, 27741.0] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [13, 33455.2] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [13, 36019.6] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [13, 38550.0] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] ++ - [30, 11749.8] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] ++ - [30, 18454.3] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] ++ - [20, 25520.3] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [24, 31776.9] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [24, 34586.7] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [6, 37775.6] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [13, 39494.7] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] ++ - [33, 14779.3] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] ++ - [19, 22062.7] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] ++ - [19, 28909.1] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] ++ - [55, 33080.5] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [55, 36944.1] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [55, 39054.4] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [22, 39165.9] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] ++ - [33, 16976.7] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] ++ - [20, 23385.0] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] ++ - [11, 30381.9] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] ++ - [55, 35334.5] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] ++ - [55, 38517.3] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [48, 39106.9] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [21, 40079.5] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] ++ - [33, 17567.5] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] ++ - [20, 
25017.0] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] ++ - [19, 31303.8] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] ++ - [55, 36382.3] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] ++ - [55, 38332.7] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] ++ - [21, 39483.2] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [24, 39187.2] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HB.yaml +new file mode 100644 +index 00000000..adb548a8 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HB.yaml +@@ -0,0 +1,17853 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ 
AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 
2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true 
++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 
++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ 
GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ 
PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: 
false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ 
Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ 
MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: 
true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 
16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] 
++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ 
AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ 
LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ 
ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 
++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: 
false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ 
StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true 
++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ 
MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ 
NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ 
VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] 
++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ 
NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ 
VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] 
++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 
4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 
++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ 
NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 
++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ 
CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ 
MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ 
NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ 
LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ 
ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 
++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: 
[11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ 
PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 
4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ 
OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 
4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ 
OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 
1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 
++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 
0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ 
SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ 
CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ 
MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ 
SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ 
AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ 
LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ 
Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 
++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: 
false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: 
true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ 
SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: 
[] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 
0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ 
SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ 
CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ 
MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ 
SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ 
AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ 
LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ 
Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 
1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 
++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 
1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 
1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 
1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 
++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 
1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [3, 34.9153] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [11, 58.2736] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [33, 98.8013] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [45, 152.299] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [46, 214.598] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [46, 271.239] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [45, 301.196] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [9, 61.71] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [29, 112.412] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [11, 191.381] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - 
[46, 298.633] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [51, 426.641] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [61, 540.086] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [46, 602.122] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [29, 149.2] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [32, 268.212] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [46, 448.829] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [46, 665.763] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [45, 922.18] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [56, 1138.1] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [46, 1240.21] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [23, 329.223] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [46, 581.573] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [45, 963.322] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [52, 1406.19] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [46, 1925.32] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [61, 2332.03] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [52, 2525.07] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [46, 644.385] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [45, 1150.23] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [59, 1932.87] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [60, 2905.15] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [59, 3908.95] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [59, 4710.06] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [51, 5090.37] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [16, 1293.74] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [55, 2312.82] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [53, 3847.1] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [57, 5772.31] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [60, 7770.84] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [55, 9587.67] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [0, 8013.37] ++ - - 
[64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [24, 2399.14] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [18, 4148.16] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [40, 6804.79] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [18, 10149.6] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [5, 13540.3] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [32, 16308.4] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [10, 17122.4] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [9, 70.167] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [15, 121.857] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [8, 197.882] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [8, 303.913] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [55, 424.934] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [51, 539.305] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [55, 604.454] ++ - - [128, 128, 1, 64, 160, 160, 128, 128] ++ - [27, 168.392] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [1, 289.183] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [19, 474.576] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [52, 700.57] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [52, 957.057] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [48, 1167.11] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [62, 1248.14] ++ - - [128, 256, 1, 64, 160, 160, 128, 256] ++ - [62, 366.635] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [8, 653.726] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [59, 1055.57] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [52, 1546.0] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [46, 2044.51] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [45, 2397.95] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [59, 2538.78] ++ - - [128, 512, 1, 64, 160, 160, 128, 512] ++ - [62, 776.867] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [56, 1360.69] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - 
[46, 2213.94] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [49, 3190.8] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [52, 4095.25] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [61, 4822.78] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [56, 5149.25] ++ - - [128, 1024, 1, 64, 160, 160, 128, 1024] ++ - [33, 1599.96] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [46, 2781.37] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [11, 4393.67] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [44, 6307.83] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [59, 8252.44] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [58, 9670.92] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [59, 10377.1] ++ - - [128, 2048, 1, 64, 160, 160, 128, 2048] ++ - [59, 2794.81] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [52, 4949.03] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [59, 8096.18] ++ - - [128, 2048, 1, 512, 160, 160, 544, 2048] ++ - [50, 11933.7] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [47, 15958.4] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [58, 19716.9] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [47, 18578.1] ++ - - [128, 4096, 1, 64, 160, 160, 128, 4096] ++ - [25, 5083.23] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [19, 8912.2] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [8, 14500.7] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [30, 21445.7] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [11, 28067.3] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [24, 32858.3] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [33, 33553.4] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [29, 157.728] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [15, 274.892] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [46, 431.027] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [49, 661.562] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 
1056] ++ - [46, 920.914] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [61, 1146.42] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [46, 1233.98] ++ - - [256, 128, 1, 64, 288, 288, 256, 128] ++ - [19, 365.485] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [11, 653.522] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [59, 1058.5] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [46, 1543.3] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [46, 2041.28] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [59, 2387.62] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [59, 2536.38] ++ - - [256, 256, 1, 64, 288, 288, 256, 256] ++ - [33, 787.663] ++ - - [256, 256, 1, 128, 288, 288, 256, 256] ++ - [4, 1365.57] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [52, 2210.15] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [59, 3105.76] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [52, 4094.0] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [56, 4833.37] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [52, 5127.12] ++ - - [256, 512, 1, 64, 288, 288, 256, 512] ++ - [11, 1600.88] ++ - - [256, 512, 1, 128, 288, 288, 256, 512] ++ - [59, 2655.04] ++ - - [256, 512, 1, 256, 288, 288, 288, 512] ++ - [46, 4520.35] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [59, 6483.34] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [59, 8388.61] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [54, 9743.58] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [59, 10426.1] ++ - - [256, 1024, 1, 64, 288, 288, 256, 1024] ++ - [46, 2973.1] ++ - - [256, 1024, 1, 128, 288, 288, 256, 1024] ++ - [59, 5197.4] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [46, 8447.77] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [44, 12321.5] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [59, 16261.9] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [49, 19341.1] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ 
- [46, 15925.5] ++ - - [256, 2048, 1, 64, 288, 288, 256, 2048] ++ - [31, 5006.65] ++ - - [256, 2048, 1, 128, 288, 288, 256, 2048] ++ - [31, 8861.6] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [19, 14450.7] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [19, 21171.7] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [11, 27292.5] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [8, 32448.2] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [19, 34710.7] ++ - - [256, 4096, 1, 64, 288, 288, 256, 4096] ++ - [46, 8869.79] ++ - - [256, 4096, 1, 128, 288, 288, 256, 4096] ++ - [59, 14643.0] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [25, 23511.9] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [25, 30297.5] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [7, 35413.7] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [7, 37152.4] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [11, 39461.3] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [27, 244.006] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [8, 399.865] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [59, 690.762] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [46, 1051.91] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [59, 1401.06] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [59, 1707.84] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [60, 1858.73] ++ - - [384, 128, 1, 64, 416, 416, 384, 128] ++ - [8, 557.756] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [11, 999.278] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [59, 1553.64] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [46, 2289.47] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [59, 3020.93] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [52, 3573.55] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [50, 3783.91] ++ - - [384, 256, 1, 64, 416, 416, 384, 256] ++ - [33, 1127.91] ++ - - [384, 256, 1, 128, 416, 416, 384, 256] ++ - [56, 
1963.63] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [46, 3210.75] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [46, 4681.59] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [46, 6143.62] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [50, 7282.05] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [44, 7603.12] ++ - - [384, 512, 1, 64, 416, 416, 384, 512] ++ - [8, 2291.13] ++ - - [384, 512, 1, 128, 416, 416, 384, 512] ++ - [59, 3987.63] ++ - - [384, 512, 1, 256, 416, 416, 384, 512] ++ - [49, 6472.69] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [52, 9422.76] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [59, 12321.1] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [49, 14284.6] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [62, 15235.6] ++ - - [384, 1024, 1, 64, 416, 416, 384, 1024] ++ - [31, 3940.16] ++ - - [384, 1024, 1, 128, 416, 416, 384, 1024] ++ - [33, 6951.91] ++ - - [384, 1024, 1, 256, 416, 416, 384, 1024] ++ - [11, 11229.7] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [8, 15833.8] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [41, 20747.9] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [40, 24599.3] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [11, 26273.9] ++ - - [384, 2048, 1, 64, 416, 416, 384, 2048] ++ - [59, 6458.57] ++ - - [384, 2048, 1, 128, 416, 416, 384, 2048] ++ - [36, 11091.2] ++ - - [384, 2048, 1, 256, 416, 416, 384, 2048] ++ - [59, 18470.3] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [31, 25392.8] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [41, 31675.0] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [8, 34049.6] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [41, 37367.5] ++ - - [384, 4096, 1, 64, 416, 416, 384, 4096] ++ - [42, 12273.1] ++ - - [384, 4096, 1, 128, 416, 416, 384, 4096] ++ - [59, 19450.0] ++ - - [384, 4096, 1, 256, 416, 416, 384, 4096] ++ - [59, 26313.8] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] 
++ - [52, 31963.0] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [46, 33835.0] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [46, 36773.3] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [48, 36908.1] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [2, 535.717] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [12, 918.863] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [45, 1455.34] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [55, 2118.34] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [45, 2845.37] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [58, 3419.04] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [51, 3699.5] ++ - - [768, 128, 1, 64, 800, 800, 768, 128] ++ - [33, 1096.65] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [33, 1963.63] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [59, 3191.2] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [46, 4675.07] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [46, 6065.16] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [59, 7186.65] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [59, 7484.55] ++ - - [768, 256, 1, 64, 800, 800, 768, 256] ++ - [6, 2168.72] ++ - - [768, 256, 1, 128, 800, 800, 768, 256] ++ - [49, 3992.04] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [59, 6497.76] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [59, 9469.76] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [45, 12084.4] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [51, 14038.5] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [62, 15070.8] ++ - - [768, 512, 1, 64, 800, 800, 768, 512] ++ - [33, 3800.34] ++ - - [768, 512, 1, 128, 800, 800, 768, 512] ++ - [31, 6970.18] ++ - - [768, 512, 1, 256, 800, 800, 768, 512] ++ - [33, 11278.8] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [8, 16259.7] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [19, 20658.4] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [11, 24612.1] ++ 
- - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [41, 26264.9] ++ - - [768, 1024, 1, 64, 800, 800, 768, 1024] ++ - [36, 6458.57] ++ - - [768, 1024, 1, 128, 800, 800, 768, 1024] ++ - [21, 11061.9] ++ - - [768, 1024, 1, 256, 800, 800, 768, 1024] ++ - [59, 18391.0] ++ - - [768, 1024, 1, 512, 800, 800, 768, 1024] ++ - [41, 25383.2] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [41, 31530.0] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [11, 34229.0] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [11, 37142.6] ++ - - [768, 2048, 1, 64, 800, 800, 768, 2048] ++ - [29, 12690.8] ++ - - [768, 2048, 1, 128, 800, 800, 768, 2048] ++ - [27, 19917.5] ++ - - [768, 2048, 1, 256, 800, 800, 768, 2048] ++ - [29, 26832.9] ++ - - [768, 2048, 1, 512, 800, 800, 768, 2048] ++ - [15, 32077.6] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [59, 34264.7] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [59, 37386.6] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [52, 37998.9] ++ - - [768, 4096, 1, 64, 800, 800, 768, 4096] ++ - [27, 18155.5] ++ - - [768, 4096, 1, 128, 800, 800, 768, 4096] ++ - [29, 25552.3] ++ - - [768, 4096, 1, 256, 800, 800, 768, 4096] ++ - [59, 31741.2] ++ - - [768, 4096, 1, 512, 800, 800, 768, 4096] ++ - [59, 34249.4] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [41, 37706.9] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [11, 39802.2] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [6, 40179.8] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [14, 1081.01] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [12, 1851.79] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [45, 2932.4] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [61, 4234.18] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [51, 5783.58] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [62, 6876.85] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [53, 6858.93] ++ - - [1536, 128, 1, 64, 1568, 
1568, 1536, 128] ++ - [19, 2272.1] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [33, 3979.42] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [59, 6451.14] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [44, 9027.29] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [45, 11830.9] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [46, 14157.5] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [33, 13835.5] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] ++ - [31, 3762.28] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] ++ - [62, 6763.21] ++ - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] ++ - [19, 11234.7] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [33, 16220.3] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [18, 21014.2] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [41, 24846.7] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [8, 26163.3] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] ++ - [26, 6484.37] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] ++ - [59, 11439.0] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] ++ - [56, 18857.9] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [41, 26962.2] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] ++ - [19, 31979.5] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [25, 34351.7] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [11, 37319.0] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] ++ - [27, 12532.8] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] ++ - [62, 20006.6] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] ++ - [28, 27122.0] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] ++ - [27, 32482.5] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [62, 34321.7] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [59, 37290.1] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [50, 37953.3] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] ++ - [62, 17898.9] 
++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] ++ - [29, 25610.8] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] ++ - [27, 31815.2] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] ++ - [59, 34524.0] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] ++ - [8, 37561.0] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [11, 39738.6] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [41, 40287.8] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] ++ - [34, 22070.4] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] ++ - [37, 29375.8] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] ++ - [59, 33306.1] ++ - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] ++ - [59, 37297.4] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] ++ - [62, 39605.9] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [59, 40255.3] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [52, 40215.5] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [7, 1872.46] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [32, 3337.65] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [16, 5451.87] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [7, 7943.76] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [30, 10278.1] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [30, 12300.4] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [30, 13004.6] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] ++ - [1, 4681.16] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [33, 7596.14] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [11, 12038.3] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [8, 17121.1] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [7, 21645.7] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [18, 24801.6] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [25, 26489.9] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] ++ - [56, 7951.29] ++ - - [3072, 256, 1, 128, 3104, 3104, 
3072, 256] ++ - [31, 13061.3] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [31, 19438.7] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [11, 26281.2] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [41, 31973.1] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [11, 34560.3] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [41, 37295.3] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] ++ - [15, 12526.5] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] ++ - [36, 19512.2] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] ++ - [27, 26250.3] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [62, 31807.7] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [52, 34137.6] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [56, 37148.2] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [59, 37135.7] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] ++ - [35, 18067.5] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] ++ - [36, 25568.6] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] ++ - [27, 31411.9] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] ++ - [59, 34285.2] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [41, 37824.7] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - [25, 39731.2] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [41, 40255.9] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] ++ - [20, 21989.7] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] ++ - [59, 29567.8] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] ++ - [15, 33288.8] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] ++ - [59, 37445.3] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] ++ - [62, 39627.4] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [59, 40289.4] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [38, 40195.0] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] ++ - [43, 25429.7] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] ++ - [29, 
31016.9] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] ++ - [59, 35987.8] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] ++ - [59, 39042.8] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] ++ - [62, 40059.8] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] ++ - [51, 40458.9] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [17, 40375.6] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [24, 2510.81] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [22, 4561.51] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [30, 7388.4] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [30, 10738.3] ++ - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] ++ - [16, 13631.0] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [7, 16088.9] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [22, 17243.6] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] ++ - [21, 5612.07] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [31, 9841.45] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [8, 15707.2] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [19, 22468.9] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [8, 28579.8] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [7, 33432.2] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [39, 34105.0] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] ++ - [56, 10348.3] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] ++ - [29, 16414.1] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [41, 23547.0] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [8, 30174.9] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [40, 34884.4] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [41, 37141.5] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [7, 39111.0] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] ++ - [59, 14532.0] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] ++ - [29, 21793.9] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 
512] ++ - [27, 28396.9] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [8, 33692.4] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [25, 36478.4] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [41, 39067.9] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [41, 40319.3] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] ++ - [35, 19888.6] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] ++ - [27, 27598.4] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] ++ - [59, 33345.0] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] ++ - [62, 35574.4] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [59, 38600.6] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [11, 40177.1] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [41, 40497.8] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] ++ - [35, 23611.2] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] ++ - [13, 30611.0] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] ++ - [59, 34723.1] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] ++ - [59, 38192.1] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] ++ - [62, 40215.5] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [52, 40698.2] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [16, 40568.9] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] ++ - [27, 24612.5] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] ++ - [59, 30943.6] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] ++ - [59, 34765.0] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] ++ - [59, 39125.2] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] ++ - [59, 40453.0] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] ++ - [59, 40600.8] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [31, 40308.5] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HB_GB.yaml 
b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HB_GB.yaml +new file mode 100644 +index 00000000..efdb45a5 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HB_GB.yaml +@@ -0,0 +1,17853 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 
1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ 
LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true 
++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 
1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ 
StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ 
EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ 
MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: 
true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ 
MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ 
IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: 
false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ 
LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 
++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ 
StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ 
EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ 
MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: 
true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ 
MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ 
IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: 
false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ 
LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 
++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ 
StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ 
EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ 
MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: 
true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ 
MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ 
IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: 
false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ 
LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true 
++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 
1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ 
StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ 
EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ 
MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: 
true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ 
MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ 
IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: 
false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ 
LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 
++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ 
StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ 
EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ 
MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: 
true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ 
MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ 
IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: 
false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ 
LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 
++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ 
StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ 
EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ 
MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: 
true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ 
MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ 
IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: 
false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 
++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 
++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 
4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ 
LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer 
++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ 
NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 
++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 
0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: 
SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 
++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: 
false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 
++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 
4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ 
LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 
0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: 
SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 
++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: 
false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer 
++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 
0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: 
SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 
++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false 
++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 
0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: 
SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 
++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: 
false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer 
++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 
0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: 
SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 
++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: 
false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [3, 34.9153] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [11, 58.2736] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [33, 98.8013] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [45, 152.299] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [46, 214.598] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [46, 271.239] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [45, 301.196] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [9, 61.71] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [29, 112.412] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [11, 191.381] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [46, 298.633] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [51, 426.641] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [61, 540.086] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [46, 602.122] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [29, 
149.2] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [32, 268.212] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [46, 448.829] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [46, 665.763] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [45, 922.18] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [56, 1138.1] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [46, 1240.21] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [23, 329.223] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [46, 581.573] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [45, 963.322] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [52, 1406.19] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [46, 1925.32] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [61, 2332.03] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [52, 2525.07] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [46, 644.385] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [45, 1150.23] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [59, 1932.87] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [60, 2905.15] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [59, 3908.95] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [59, 4710.06] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [51, 5090.37] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [16, 1293.74] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [55, 2312.82] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [53, 3847.1] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [57, 5772.31] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [60, 7770.84] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [55, 9587.67] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [0, 8013.37] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [24, 2399.14] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [18, 4148.16] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [40, 6804.79] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [18, 10149.6] ++ - - [64, 
4096, 1, 1024, 96, 96, 1056, 4096] ++ - [5, 13540.3] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [32, 16308.4] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [10, 17122.4] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [9, 70.167] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [15, 121.857] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [8, 197.882] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [8, 303.913] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [55, 424.934] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [51, 539.305] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [55, 604.454] ++ - - [128, 128, 1, 64, 160, 160, 128, 128] ++ - [27, 168.392] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [1, 289.183] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [19, 474.576] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [52, 700.57] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [52, 957.057] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [48, 1167.11] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [62, 1248.14] ++ - - [128, 256, 1, 64, 160, 160, 128, 256] ++ - [62, 366.635] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [8, 653.726] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [59, 1055.57] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [52, 1546.0] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [46, 2044.51] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [45, 2397.95] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [59, 2538.78] ++ - - [128, 512, 1, 64, 160, 160, 128, 512] ++ - [62, 776.867] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [56, 1360.69] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [46, 2213.94] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [49, 3190.8] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [52, 4095.25] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [61, 4822.78] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 
4128] ++ - [56, 5149.25] ++ - - [128, 1024, 1, 64, 160, 160, 128, 1024] ++ - [33, 1599.96] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [46, 2781.37] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [11, 4393.67] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [44, 6307.83] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [59, 8252.44] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [58, 9670.92] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [59, 10377.1] ++ - - [128, 2048, 1, 64, 160, 160, 128, 2048] ++ - [59, 2794.81] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [52, 4949.03] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [59, 8096.18] ++ - - [128, 2048, 1, 512, 160, 160, 544, 2048] ++ - [50, 11933.7] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [47, 15958.4] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [58, 19716.9] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [47, 18578.1] ++ - - [128, 4096, 1, 64, 160, 160, 128, 4096] ++ - [25, 5083.23] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [19, 8912.2] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [8, 14500.7] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [30, 21445.7] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [11, 28067.3] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [24, 32858.3] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [33, 33553.4] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [29, 157.728] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [15, 274.892] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [46, 431.027] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [49, 661.562] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [46, 920.914] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [61, 1146.42] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [46, 1233.98] ++ - - [256, 128, 1, 64, 288, 288, 256, 128] ++ - [19, 365.485] ++ - - [256, 128, 1, 128, 288, 288, 
256, 160] ++ - [11, 653.522] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [59, 1058.5] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [46, 1543.3] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [46, 2041.28] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [59, 2387.62] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [59, 2536.38] ++ - - [256, 256, 1, 64, 288, 288, 256, 256] ++ - [33, 787.663] ++ - - [256, 256, 1, 128, 288, 288, 256, 256] ++ - [4, 1365.57] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [52, 2210.15] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [59, 3105.76] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [52, 4094.0] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [56, 4833.37] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [52, 5127.12] ++ - - [256, 512, 1, 64, 288, 288, 256, 512] ++ - [11, 1600.88] ++ - - [256, 512, 1, 128, 288, 288, 256, 512] ++ - [59, 2655.04] ++ - - [256, 512, 1, 256, 288, 288, 288, 512] ++ - [46, 4520.35] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [59, 6483.34] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [59, 8388.61] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [54, 9743.58] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [59, 10426.1] ++ - - [256, 1024, 1, 64, 288, 288, 256, 1024] ++ - [46, 2973.1] ++ - - [256, 1024, 1, 128, 288, 288, 256, 1024] ++ - [59, 5197.4] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [46, 8447.77] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [44, 12321.5] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [59, 16261.9] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [49, 19341.1] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [46, 15925.5] ++ - - [256, 2048, 1, 64, 288, 288, 256, 2048] ++ - [31, 5006.65] ++ - - [256, 2048, 1, 128, 288, 288, 256, 2048] ++ - [31, 8861.6] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [19, 14450.7] ++ - - [256, 2048, 1, 512, 288, 288, 544, 
2048] ++ - [19, 21171.7] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [11, 27292.5] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [8, 32448.2] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [19, 34710.7] ++ - - [256, 4096, 1, 64, 288, 288, 256, 4096] ++ - [46, 8869.79] ++ - - [256, 4096, 1, 128, 288, 288, 256, 4096] ++ - [59, 14643.0] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [25, 23511.9] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [25, 30297.5] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [7, 35413.7] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [7, 37152.4] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [11, 39461.3] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [27, 244.006] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [8, 399.865] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [59, 690.762] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [46, 1051.91] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [59, 1401.06] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [59, 1707.84] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [60, 1858.73] ++ - - [384, 128, 1, 64, 416, 416, 384, 128] ++ - [8, 557.756] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [11, 999.278] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [59, 1553.64] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [46, 2289.47] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [59, 3020.93] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [52, 3573.55] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [50, 3783.91] ++ - - [384, 256, 1, 64, 416, 416, 384, 256] ++ - [33, 1127.91] ++ - - [384, 256, 1, 128, 416, 416, 384, 256] ++ - [56, 1963.63] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [46, 3210.75] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [46, 4681.59] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [46, 6143.62] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - 
[50, 7282.05] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [44, 7603.12] ++ - - [384, 512, 1, 64, 416, 416, 384, 512] ++ - [8, 2291.13] ++ - - [384, 512, 1, 128, 416, 416, 384, 512] ++ - [59, 3987.63] ++ - - [384, 512, 1, 256, 416, 416, 384, 512] ++ - [49, 6472.69] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [52, 9422.76] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [59, 12321.1] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [49, 14284.6] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [62, 15235.6] ++ - - [384, 1024, 1, 64, 416, 416, 384, 1024] ++ - [31, 3940.16] ++ - - [384, 1024, 1, 128, 416, 416, 384, 1024] ++ - [33, 6951.91] ++ - - [384, 1024, 1, 256, 416, 416, 384, 1024] ++ - [11, 11229.7] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [8, 15833.8] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [41, 20747.9] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [40, 24599.3] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [11, 26273.9] ++ - - [384, 2048, 1, 64, 416, 416, 384, 2048] ++ - [59, 6458.57] ++ - - [384, 2048, 1, 128, 416, 416, 384, 2048] ++ - [36, 11091.2] ++ - - [384, 2048, 1, 256, 416, 416, 384, 2048] ++ - [59, 18470.3] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [31, 25392.8] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [41, 31675.0] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [8, 34049.6] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [41, 37367.5] ++ - - [384, 4096, 1, 64, 416, 416, 384, 4096] ++ - [42, 12273.1] ++ - - [384, 4096, 1, 128, 416, 416, 384, 4096] ++ - [59, 19450.0] ++ - - [384, 4096, 1, 256, 416, 416, 384, 4096] ++ - [59, 26313.8] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] ++ - [52, 31963.0] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [46, 33835.0] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [46, 36773.3] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [48, 36908.1] ++ - - [768, 64, 1, 64, 800, 800, 
768, 96] ++ - [2, 535.717] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [12, 918.863] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [45, 1455.34] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [55, 2118.34] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [45, 2845.37] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [58, 3419.04] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [51, 3699.5] ++ - - [768, 128, 1, 64, 800, 800, 768, 128] ++ - [33, 1096.65] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [33, 1963.63] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [59, 3191.2] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [46, 4675.07] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [46, 6065.16] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [59, 7186.65] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [59, 7484.55] ++ - - [768, 256, 1, 64, 800, 800, 768, 256] ++ - [6, 2168.72] ++ - - [768, 256, 1, 128, 800, 800, 768, 256] ++ - [49, 3992.04] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [59, 6497.76] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [59, 9469.76] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [45, 12084.4] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [51, 14038.5] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [62, 15070.8] ++ - - [768, 512, 1, 64, 800, 800, 768, 512] ++ - [33, 3800.34] ++ - - [768, 512, 1, 128, 800, 800, 768, 512] ++ - [31, 6970.18] ++ - - [768, 512, 1, 256, 800, 800, 768, 512] ++ - [33, 11278.8] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [8, 16259.7] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [19, 20658.4] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [11, 24612.1] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [41, 26264.9] ++ - - [768, 1024, 1, 64, 800, 800, 768, 1024] ++ - [36, 6458.57] ++ - - [768, 1024, 1, 128, 800, 800, 768, 1024] ++ - [21, 11061.9] ++ - - [768, 1024, 1, 256, 800, 800, 768, 1024] ++ - [59, 
18391.0] ++ - - [768, 1024, 1, 512, 800, 800, 768, 1024] ++ - [41, 25383.2] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [41, 31530.0] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [11, 34229.0] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [11, 37142.6] ++ - - [768, 2048, 1, 64, 800, 800, 768, 2048] ++ - [29, 12690.8] ++ - - [768, 2048, 1, 128, 800, 800, 768, 2048] ++ - [27, 19917.5] ++ - - [768, 2048, 1, 256, 800, 800, 768, 2048] ++ - [29, 26832.9] ++ - - [768, 2048, 1, 512, 800, 800, 768, 2048] ++ - [15, 32077.6] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [59, 34264.7] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [59, 37386.6] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [52, 37998.9] ++ - - [768, 4096, 1, 64, 800, 800, 768, 4096] ++ - [27, 18155.5] ++ - - [768, 4096, 1, 128, 800, 800, 768, 4096] ++ - [29, 25552.3] ++ - - [768, 4096, 1, 256, 800, 800, 768, 4096] ++ - [59, 31741.2] ++ - - [768, 4096, 1, 512, 800, 800, 768, 4096] ++ - [59, 34249.4] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [41, 37706.9] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [11, 39802.2] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [6, 40179.8] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [14, 1081.01] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [12, 1851.79] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [45, 2932.4] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [61, 4234.18] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [51, 5783.58] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [62, 6876.85] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [53, 6858.93] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] ++ - [19, 2272.1] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [33, 3979.42] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [59, 6451.14] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [44, 9027.29] ++ - - [1536, 
128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [45, 11830.9] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [46, 14157.5] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [33, 13835.5] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] ++ - [31, 3762.28] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] ++ - [62, 6763.21] ++ - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] ++ - [19, 11234.7] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [33, 16220.3] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [18, 21014.2] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [41, 24846.7] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [8, 26163.3] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] ++ - [26, 6484.37] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] ++ - [59, 11439.0] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] ++ - [56, 18857.9] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [41, 26962.2] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] ++ - [19, 31979.5] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [25, 34351.7] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [11, 37319.0] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] ++ - [27, 12532.8] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] ++ - [62, 20006.6] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] ++ - [28, 27122.0] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] ++ - [27, 32482.5] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [62, 34321.7] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [59, 37290.1] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [50, 37953.3] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] ++ - [62, 17898.9] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] ++ - [29, 25610.8] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] ++ - [27, 31815.2] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] ++ - [59, 34524.0] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 
1536, 2048] ++ - [8, 37561.0] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [11, 39738.6] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [41, 40287.8] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] ++ - [34, 22070.4] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] ++ - [37, 29375.8] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] ++ - [59, 33306.1] ++ - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] ++ - [59, 37297.4] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] ++ - [62, 39605.9] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [59, 40255.3] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [52, 40215.5] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [7, 1872.46] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [32, 3337.65] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [16, 5451.87] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [7, 7943.76] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [30, 10278.1] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [30, 12300.4] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [30, 13004.6] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] ++ - [1, 4681.16] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [33, 7596.14] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [11, 12038.3] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [8, 17121.1] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [7, 21645.7] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [18, 24801.6] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [25, 26489.9] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] ++ - [56, 7951.29] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] ++ - [31, 13061.3] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [31, 19438.7] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [11, 26281.2] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [41, 31973.1] ++ - - [3072, 
256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [11, 34560.3] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [41, 37295.3] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] ++ - [15, 12526.5] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] ++ - [36, 19512.2] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] ++ - [27, 26250.3] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [62, 31807.7] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [52, 34137.6] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [56, 37148.2] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [59, 37135.7] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] ++ - [35, 18067.5] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] ++ - [36, 25568.6] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] ++ - [27, 31411.9] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] ++ - [59, 34285.2] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [41, 37824.7] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - [25, 39731.2] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [41, 40255.9] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] ++ - [20, 21989.7] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] ++ - [59, 29567.8] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] ++ - [15, 33288.8] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] ++ - [59, 37445.3] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] ++ - [62, 39627.4] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [59, 40289.4] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [38, 40195.0] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] ++ - [43, 25429.7] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] ++ - [29, 31016.9] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] ++ - [59, 35987.8] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] ++ - [59, 39042.8] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] ++ - [62, 40059.8] ++ - - [3072, 4096, 1, 2048, 
3104, 3104, 3072, 4096] ++ - [51, 40458.9] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [17, 40375.6] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [24, 2510.81] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [22, 4561.51] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [30, 7388.4] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [30, 10738.3] ++ - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] ++ - [16, 13631.0] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [7, 16088.9] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [22, 17243.6] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] ++ - [21, 5612.07] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [31, 9841.45] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [8, 15707.2] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [19, 22468.9] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [8, 28579.8] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [7, 33432.2] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [39, 34105.0] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] ++ - [56, 10348.3] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] ++ - [29, 16414.1] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [41, 23547.0] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [8, 30174.9] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [40, 34884.4] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [41, 37141.5] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [7, 39111.0] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] ++ - [59, 14532.0] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] ++ - [29, 21793.9] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] ++ - [27, 28396.9] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [8, 33692.4] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [25, 36478.4] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [41, 39067.9] ++ - - [4096, 512, 
1, 4096, 4128, 4128, 4128, 4128] ++ - [41, 40319.3] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] ++ - [35, 19888.6] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] ++ - [27, 27598.4] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] ++ - [59, 33345.0] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] ++ - [62, 35574.4] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [59, 38600.6] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [11, 40177.1] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [41, 40497.8] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] ++ - [35, 23611.2] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] ++ - [13, 30611.0] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] ++ - [59, 34723.1] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] ++ - [59, 38192.1] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] ++ - [62, 40215.5] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [52, 40698.2] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [16, 40568.9] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] ++ - [27, 24612.5] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] ++ - [59, 30943.6] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] ++ - [59, 34765.0] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] ++ - [59, 39125.2] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] ++ - [59, 40453.0] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] ++ - [59, 40600.8] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [31, 40308.5] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HHS_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HHS_BH.yaml +new file mode 100644 +index 00000000..fac52d82 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HHS_BH.yaml +@@ -0,0 +1,8943 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- 
gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 
++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ 
Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} 
++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA1_SVW4_VW4_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false 
++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ 
LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW4_PLR1_SIA2_SVW1_VW1_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW4_PLR1_SIA2_SVW4_VW4_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 
0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ 
DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ 
MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ 
TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW8_PLR1_SIA2_SVW1_VW1_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW8_PLR1_SIA2_SVW4_VW4_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 
0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ 
DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ 
MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ 
TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW4_PLR1_SIA2_SVW1_VW1_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW8_PLR1_SIA2_SVW1_VW1_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: 
{0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: 
{0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW4_PLR1_SIA2_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW8_PLR1_SIA2_SVW1_VW1_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: 
{0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW4_PLR0_SIA1_SVW4_VW4_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 
++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ 
LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [0, 33.3475] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [1, 60.2562] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [19, 94.8167] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [28, 147.086] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [23, 210.168] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [27, 261.581] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [26, 295.415] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [1, 61.3707] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [1, 111.397] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [19, 189.736] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [27, 295.728] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [24, 414.929] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [23, 525.933] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [27, 585.808] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [1, 134.45] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [18, 242.98] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [24, 410.563] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [27, 635.406] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [27, 891.082] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [24, 1097.41] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [29, 1205.26] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [15, 302.359] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [24, 542.04] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [24, 912.8] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [24, 1372.82] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [29, 1876.75] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [29, 2242.72] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [29, 2438.11] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [19, 629.397] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [27, 
1132.07] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [29, 1885.08] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [24, 2826.35] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [24, 3808.89] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [29, 4513.64] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [29, 4923.25] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [19, 1215.39] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [25, 2175.47] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [26, 3627.9] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [26, 5468.9] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [28, 7433.39] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [28, 9170.99] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [3, 8846.25] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [2, 2195.68] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [5, 3933.22] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [11, 6499.63] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [18, 9777.63] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [8, 13251.5] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [18, 15685.1] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [5, 16704.3] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [16, 67.2941] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [7, 124.401] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [0, 196.068] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [19, 286.868] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [29, 413.007] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [27, 530.017] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [24, 585.059] ++ - - [128, 128, 1, 64, 160, 160, 128, 128] ++ - [15, 152.034] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [19, 277.18] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [19, 456.05] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [5, 674.052] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [27, 
924.666] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [24, 1123.72] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [29, 1216.84] ++ - - [128, 256, 1, 64, 160, 160, 128, 256] ++ - [1, 343.736] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [11, 604.539] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [29, 981.35] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [29, 1454.71] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [27, 1948.45] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [29, 2327.01] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [24, 2473.28] ++ - - [128, 512, 1, 64, 160, 160, 128, 512] ++ - [3, 722.9] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [19, 1272.74] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [29, 2066.92] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [22, 3012.59] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [24, 3988.87] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [27, 4769.28] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [29, 4977.46] ++ - - [128, 1024, 1, 64, 160, 160, 128, 1024] ++ - [11, 1472.71] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [24, 2582.68] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [29, 4210.08] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [24, 6137.6] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [24, 8046.12] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [24, 9449.25] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [29, 10159.3] ++ - - [128, 2048, 1, 64, 160, 160, 128, 2048] ++ - [19, 2906.64] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [19, 5100.23] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [27, 8268.71] ++ - - [128, 2048, 1, 512, 160, 160, 544, 2048] ++ - [29, 11997.6] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [27, 15820.9] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [27, 18992.7] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - 
[11, 18019.1] ++ - - [128, 4096, 1, 64, 160, 160, 128, 4096] ++ - [19, 4825.2] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [19, 8484.03] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [11, 13846.8] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [5, 20582.3] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [5, 27091.3] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [17, 32514.8] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [5, 33128.8] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [0, 148.04] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [16, 271.933] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [0, 428.383] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [24, 635.018] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [29, 886.553] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [28, 1096.72] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [27, 1208.16] ++ - - [256, 128, 1, 64, 288, 288, 256, 128] ++ - [12, 348.246] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [11, 634.342] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [11, 982.96] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [29, 1448.43] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [24, 1926.75] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [27, 2324.75] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [29, 2466.64] ++ - - [256, 256, 1, 64, 288, 288, 256, 256] ++ - [19, 722.903] ++ - - [256, 256, 1, 128, 288, 288, 256, 256] ++ - [9, 1261.63] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [24, 2152.84] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [27, 3097.41] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [27, 4004.08] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [27, 4718.31] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [29, 4981.15] ++ - - [256, 512, 1, 64, 288, 288, 256, 512] ++ - [11, 1478.42] ++ - - [256, 512, 1, 128, 288, 288, 256, 512] ++ - [19, 2574.75] ++ - - 
[256, 512, 1, 256, 288, 288, 288, 512] ++ - [27, 4185.91] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [29, 6097.44] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [29, 8020.15] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [29, 9391.05] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [29, 10014.5] ++ - - [256, 1024, 1, 64, 288, 288, 256, 1024] ++ - [19, 2914.71] ++ - - [256, 1024, 1, 128, 288, 288, 256, 1024] ++ - [19, 5103.34] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [29, 7932.44] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [24, 12016.9] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [29, 15848.0] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [29, 18661.3] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [27, 18776.9] ++ - - [256, 2048, 1, 64, 288, 288, 256, 2048] ++ - [5, 4737.29] ++ - - [256, 2048, 1, 128, 288, 288, 256, 2048] ++ - [19, 8492.59] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [9, 14290.6] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [11, 20837.9] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [5, 27317.4] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [5, 32149.7] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [3, 33865.9] ++ - - [256, 4096, 1, 64, 288, 288, 256, 4096] ++ - [15, 8499.07] ++ - - [256, 4096, 1, 128, 288, 288, 256, 4096] ++ - [24, 14293.6] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [19, 22544.2] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [17, 29125.4] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [17, 34461.0] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [3, 36147.9] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [3, 38170.8] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [1, 240.426] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [15, 435.935] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [10, 679.862] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [27, 998.719] ++ - 
- [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [24, 1368.37] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [24, 1659.57] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [26, 1798.48] ++ - - [384, 128, 1, 64, 416, 416, 384, 128] ++ - [19, 538.652] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [11, 980.589] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [29, 1562.69] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [29, 2290.29] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [25, 2974.84] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [24, 3473.77] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [27, 3719.3] ++ - - [384, 256, 1, 64, 416, 416, 384, 256] ++ - [19, 1157.36] ++ - - [384, 256, 1, 128, 416, 416, 384, 256] ++ - [3, 2009.08] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [24, 3112.63] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [29, 4552.4] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [24, 5973.69] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [29, 7023.88] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [29, 7430.08] ++ - - [384, 512, 1, 64, 416, 416, 384, 512] ++ - [11, 2239.34] ++ - - [384, 512, 1, 128, 416, 416, 384, 512] ++ - [5, 3895.61] ++ - - [384, 512, 1, 256, 416, 416, 384, 512] ++ - [27, 6304.02] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [24, 9187.06] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [27, 11949.5] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [24, 13987.7] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [24, 14798.4] ++ - - [384, 1024, 1, 64, 416, 416, 384, 1024] ++ - [9, 3709.02] ++ - - [384, 1024, 1, 128, 416, 416, 384, 1024] ++ - [5, 6579.24] ++ - - [384, 1024, 1, 256, 416, 416, 384, 1024] ++ - [9, 10616.2] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [3, 15411.9] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [5, 20280.6] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [3, 23975.9] ++ - - [384, 1024, 
1, 4096, 416, 416, 4128, 4128] ++ - [19, 25768.0] ++ - - [384, 2048, 1, 64, 416, 416, 384, 2048] ++ - [19, 6402.68] ++ - - [384, 2048, 1, 128, 416, 416, 384, 2048] ++ - [19, 10935.7] ++ - - [384, 2048, 1, 256, 416, 416, 384, 2048] ++ - [11, 18285.7] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [19, 25383.0] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [19, 31209.6] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [11, 33705.2] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [19, 36970.6] ++ - - [384, 4096, 1, 64, 416, 416, 384, 4096] ++ - [15, 12407.6] ++ - - [384, 4096, 1, 128, 416, 416, 384, 4096] ++ - [6, 18847.1] ++ - - [384, 4096, 1, 256, 416, 416, 384, 4096] ++ - [24, 25508.5] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] ++ - [29, 31019.7] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [29, 32931.3] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [27, 35585.6] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [29, 36275.9] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [6, 500.191] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [7, 904.588] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [9, 1407.95] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [29, 2078.95] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [24, 2727.69] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [23, 3304.43] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [24, 3579.76] ++ - - [768, 128, 1, 64, 800, 800, 768, 128] ++ - [19, 1064.72] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [11, 1926.64] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [24, 3102.28] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [24, 4544.59] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [24, 5996.12] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [24, 7004.09] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [29, 7321.33] ++ - - [768, 256, 1, 64, 800, 800, 768, 256] ++ - [19, 2131.25] ++ - - [768, 256, 1, 
128, 800, 800, 768, 256] ++ - [19, 3751.61] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [29, 6078.68] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [24, 9161.14] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [27, 11783.8] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [29, 13862.5] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [27, 14794.9] ++ - - [768, 512, 1, 64, 800, 800, 768, 512] ++ - [3, 3885.4] ++ - - [768, 512, 1, 128, 800, 800, 768, 512] ++ - [5, 6835.71] ++ - - [768, 512, 1, 256, 800, 800, 768, 512] ++ - [5, 11035.2] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [19, 15944.0] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [11, 20272.4] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [19, 24107.3] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [5, 25649.4] ++ - - [768, 1024, 1, 64, 800, 800, 768, 1024] ++ - [15, 6407.57] ++ - - [768, 1024, 1, 128, 800, 800, 768, 1024] ++ - [7, 10938.0] ++ - - [768, 1024, 1, 256, 800, 800, 768, 1024] ++ - [9, 18267.5] ++ - - [768, 1024, 1, 512, 800, 800, 768, 1024] ++ - [17, 25837.5] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [19, 31605.2] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [5, 33986.9] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [11, 36786.5] ++ - - [768, 2048, 1, 64, 800, 800, 768, 2048] ++ - [16, 12255.0] ++ - - [768, 2048, 1, 128, 800, 800, 768, 2048] ++ - [16, 18944.8] ++ - - [768, 2048, 1, 256, 800, 800, 768, 2048] ++ - [19, 26147.8] ++ - - [768, 2048, 1, 512, 800, 800, 768, 2048] ++ - [5, 31502.6] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [29, 33500.5] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [29, 36337.9] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [29, 37489.7] ++ - - [768, 4096, 1, 64, 800, 800, 768, 4096] ++ - [21, 16901.1] ++ - - [768, 4096, 1, 128, 800, 800, 768, 4096] ++ - [16, 24224.0] ++ - - [768, 4096, 1, 256, 800, 800, 768, 4096] ++ - [16, 30558.3] ++ - - [768, 4096, 
1, 512, 800, 800, 768, 4096] ++ - [29, 33524.2] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [29, 36968.0] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [29, 38855.1] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [3, 39145.7] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [5, 959.499] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [1, 1739.4] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [9, 2761.83] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [25, 4153.09] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [24, 5531.84] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [28, 6735.99] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [27, 6695.34] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] ++ - [19, 2156.07] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [11, 3877.6] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [11, 6205.34] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [24, 9034.54] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [24, 11692.1] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [24, 13805.5] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [23, 12360.0] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] ++ - [29, 3770.15] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] ++ - [3, 6807.05] ++ - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] ++ - [5, 10966.6] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [19, 15902.4] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [5, 20509.9] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [5, 24440.1] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [5, 25719.9] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] ++ - [29, 6648.81] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] ++ - [19, 12339.1] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] ++ - [5, 18775.1] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [5, 25721.9] ++ - - [1536, 512, 1, 
1024, 1568, 1568, 1536, 1056] ++ - [17, 31457.1] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [5, 33933.2] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [9, 36789.4] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] ++ - [16, 12281.9] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] ++ - [16, 19119.2] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] ++ - [16, 25930.7] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] ++ - [29, 31323.7] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [29, 33901.0] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [27, 36396.2] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [27, 37584.2] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] ++ - [15, 16976.6] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] ++ - [15, 24325.0] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] ++ - [29, 30859.2] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] ++ - [29, 33834.1] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] ++ - [29, 37161.2] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [27, 38910.3] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [17, 39133.9] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] ++ - [21, 21167.6] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] ++ - [16, 28283.0] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] ++ - [16, 32543.4] ++ - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] ++ - [29, 36737.0] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] ++ - [29, 39192.2] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [29, 39767.9] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [22, 39218.6] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [18, 1860.54] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [10, 3224.71] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [10, 5030.61] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [10, 7427.89] ++ - - [3072, 64, 1, 1024, 3104, 3104, 
3072, 1056] ++ - [2, 9882.95] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [4, 11801.7] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [2, 12723.2] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] ++ - [5, 4342.66] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [19, 7119.02] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [5, 11401.4] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [5, 16782.7] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [5, 21243.6] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [5, 24716.9] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [11, 25831.7] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] ++ - [5, 7827.6] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] ++ - [11, 12872.5] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [9, 19403.0] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [9, 26224.5] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [5, 31837.7] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [19, 34350.1] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [5, 37051.4] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] ++ - [15, 12697.1] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] ++ - [14, 19472.4] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] ++ - [15, 26275.9] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [9, 31548.3] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [29, 33573.8] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [29, 36265.5] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [10, 36745.4] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] ++ - [20, 16943.8] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] ++ - [15, 24218.2] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] ++ - [16, 30664.2] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] ++ - [29, 33449.0] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [29, 37220.0] ++ - - [3072, 1024, 1, 
2048, 3104, 3104, 3072, 2080] ++ - [29, 38894.3] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [9, 39112.3] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] ++ - [21, 21172.1] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] ++ - [16, 28450.9] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] ++ - [29, 32295.4] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] ++ - [29, 36761.3] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] ++ - [29, 39147.5] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [29, 39866.8] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [19, 39321.9] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] ++ - [12, 24300.0] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] ++ - [15, 29880.3] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] ++ - [15, 35099.8] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] ++ - [29, 38357.6] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] ++ - [29, 39665.7] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] ++ - [27, 39551.4] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [29, 39057.0] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [18, 1941.12] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [18, 3898.94] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [8, 6458.35] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [2, 9728.69] ++ - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] ++ - [8, 13058.7] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [10, 15758.3] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [8, 16838.4] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] ++ - [19, 5578.46] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [5, 10244.0] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [19, 16097.0] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [5, 22698.7] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [11, 28528.0] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [5, 
32699.0] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [17, 33836.6] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] ++ - [16, 10047.7] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] ++ - [19, 16085.4] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [9, 22574.6] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [5, 29203.0] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [17, 34067.4] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [9, 35986.2] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [17, 38270.9] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] ++ - [16, 14686.2] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] ++ - [7, 21709.2] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] ++ - [19, 28392.2] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [19, 33614.1] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [19, 35320.3] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [9, 37922.5] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [17, 39119.3] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] ++ - [13, 18758.5] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] ++ - [16, 26225.8] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] ++ - [16, 32201.8] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] ++ - [29, 35193.0] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [29, 38060.3] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [29, 39637.9] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [18, 39003.6] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] ++ - [21, 22616.4] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] ++ - [16, 29748.3] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] ++ - [29, 33752.1] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] ++ - [29, 37550.1] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] ++ - [29, 39729.6] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [27, 40134.6] ++ - - [4096, 2048, 1, 
4096, 4128, 4128, 4128, 4128] ++ - [29, 39859.9] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] ++ - [21, 23473.8] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] ++ - [19, 30001.5] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] ++ - [29, 34046.1] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] ++ - [29, 38624.9] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] ++ - [16, 39558.4] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] ++ - [27, 39854.3] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [18, 38537.4] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HHS_BH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HHS_BH_GB.yaml +new file mode 100644 +index 00000000..205b2f7e +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_HHS_BH_GB.yaml +@@ -0,0 +1,8943 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ 
TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 
++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ 
GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ 
NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA1_SVW4_VW4_WGM2 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW4_PLR1_SIA2_SVW1_VW1_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: 
{0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 
++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 
2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW4_PLR1_SIA2_SVW4_VW4_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW8_PLR1_SIA2_SVW1_VW1_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 
++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ 
GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ 
NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW8_PLR1_SIA2_SVW4_VW4_WGM2 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: 
{0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 
++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 
2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW4_PLR1_SIA2_SVW1_VW1_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 
++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ 
GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ 
NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW8_PLR1_SIA2_SVW1_VW1_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: 
{0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 
++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 
2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 
++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ 
GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ 
NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: 
{0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 
++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 
2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 
++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ 
GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ 
NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW4_PLR1_SIA2_SVW4_VW4_WGM8 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW8_PLR1_SIA2_SVW1_VW1_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: 
{0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 
++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 
2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 
++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ 
GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ 
NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW4_PLR0_SIA1_SVW4_VW4_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: 
{0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: 
-1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 
3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM2 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 
++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 2 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ 
GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 
++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: 
{0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: 
-1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 
3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 128 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 
++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ 
GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 
++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [0, 33.3475] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [1, 60.2562] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [19, 94.8167] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [28, 147.086] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [23, 210.168] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [27, 261.581] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [26, 295.415] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [1, 61.3707] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [1, 111.397] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [19, 189.736] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [27, 295.728] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [24, 414.929] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [23, 525.933] ++ - - [64, 128, 1, 4096, 
96, 96, 4128, 4128] ++ - [27, 585.808] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [1, 134.45] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [18, 242.98] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [24, 410.563] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [27, 635.406] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [27, 891.082] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [24, 1097.41] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [29, 1205.26] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [15, 302.359] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [24, 542.04] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [24, 912.8] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [24, 1372.82] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [29, 1876.75] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [29, 2242.72] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [29, 2438.11] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [19, 629.397] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [27, 1132.07] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [29, 1885.08] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [24, 2826.35] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [24, 3808.89] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [29, 4513.64] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [29, 4923.25] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [19, 1215.39] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [25, 2175.47] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [26, 3627.9] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [26, 5468.9] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [28, 7433.39] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [28, 9170.99] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [3, 8846.25] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [2, 2195.68] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [5, 3933.22] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [11, 
6499.63] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [18, 9777.63] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [8, 13251.5] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [18, 15685.1] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [5, 16704.3] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [16, 67.2941] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [7, 124.401] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [0, 196.068] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [19, 286.868] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [29, 413.007] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [27, 530.017] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [24, 585.059] ++ - - [128, 128, 1, 64, 160, 160, 128, 128] ++ - [15, 152.034] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [19, 277.18] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [19, 456.05] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [5, 674.052] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [27, 924.666] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [24, 1123.72] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [29, 1216.84] ++ - - [128, 256, 1, 64, 160, 160, 128, 256] ++ - [1, 343.736] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [11, 604.539] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [29, 981.35] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [29, 1454.71] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [27, 1948.45] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [29, 2327.01] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [24, 2473.28] ++ - - [128, 512, 1, 64, 160, 160, 128, 512] ++ - [3, 722.9] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [19, 1272.74] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [29, 2066.92] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [22, 3012.59] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [24, 3988.87] ++ - - [128, 512, 1, 2048, 160, 
160, 2080, 2080] ++ - [27, 4769.28] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [29, 4977.46] ++ - - [128, 1024, 1, 64, 160, 160, 128, 1024] ++ - [11, 1472.71] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [24, 2582.68] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [29, 4210.08] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [24, 6137.6] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [24, 8046.12] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [24, 9449.25] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [29, 10159.3] ++ - - [128, 2048, 1, 64, 160, 160, 128, 2048] ++ - [19, 2906.64] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [19, 5100.23] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [27, 8268.71] ++ - - [128, 2048, 1, 512, 160, 160, 544, 2048] ++ - [29, 11997.6] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [27, 15820.9] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [27, 18992.7] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [11, 18019.1] ++ - - [128, 4096, 1, 64, 160, 160, 128, 4096] ++ - [19, 4825.2] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [19, 8484.03] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [11, 13846.8] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [5, 20582.3] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [5, 27091.3] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [17, 32514.8] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [5, 33128.8] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [0, 148.04] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [16, 271.933] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [0, 428.383] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [24, 635.018] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [29, 886.553] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [28, 1096.72] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [27, 1208.16] ++ - - [256, 128, 1, 64, 
288, 288, 256, 128] ++ - [12, 348.246] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [11, 634.342] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [11, 982.96] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [29, 1448.43] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [24, 1926.75] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [27, 2324.75] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [29, 2466.64] ++ - - [256, 256, 1, 64, 288, 288, 256, 256] ++ - [19, 722.903] ++ - - [256, 256, 1, 128, 288, 288, 256, 256] ++ - [9, 1261.63] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [24, 2152.84] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [27, 3097.41] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [27, 4004.08] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [27, 4718.31] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [29, 4981.15] ++ - - [256, 512, 1, 64, 288, 288, 256, 512] ++ - [11, 1478.42] ++ - - [256, 512, 1, 128, 288, 288, 256, 512] ++ - [19, 2574.75] ++ - - [256, 512, 1, 256, 288, 288, 288, 512] ++ - [27, 4185.91] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [29, 6097.44] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [29, 8020.15] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [29, 9391.05] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [29, 10014.5] ++ - - [256, 1024, 1, 64, 288, 288, 256, 1024] ++ - [19, 2914.71] ++ - - [256, 1024, 1, 128, 288, 288, 256, 1024] ++ - [19, 5103.34] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [29, 7932.44] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [24, 12016.9] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [29, 15848.0] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [29, 18661.3] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [27, 18776.9] ++ - - [256, 2048, 1, 64, 288, 288, 256, 2048] ++ - [5, 4737.29] ++ - - [256, 2048, 1, 128, 288, 288, 256, 2048] ++ - [19, 8492.59] ++ - - [256, 2048, 1, 256, 288, 
288, 288, 2048] ++ - [9, 14290.6] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [11, 20837.9] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [5, 27317.4] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [5, 32149.7] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [3, 33865.9] ++ - - [256, 4096, 1, 64, 288, 288, 256, 4096] ++ - [15, 8499.07] ++ - - [256, 4096, 1, 128, 288, 288, 256, 4096] ++ - [24, 14293.6] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [19, 22544.2] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [17, 29125.4] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [17, 34461.0] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [3, 36147.9] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [3, 38170.8] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [1, 240.426] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [15, 435.935] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [10, 679.862] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [27, 998.719] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [24, 1368.37] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [24, 1659.57] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [26, 1798.48] ++ - - [384, 128, 1, 64, 416, 416, 384, 128] ++ - [19, 538.652] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [11, 980.589] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [29, 1562.69] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [29, 2290.29] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [25, 2974.84] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [24, 3473.77] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [27, 3719.3] ++ - - [384, 256, 1, 64, 416, 416, 384, 256] ++ - [19, 1157.36] ++ - - [384, 256, 1, 128, 416, 416, 384, 256] ++ - [3, 2009.08] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [24, 3112.63] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [29, 4552.4] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] 
++ - [24, 5973.69] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [29, 7023.88] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [29, 7430.08] ++ - - [384, 512, 1, 64, 416, 416, 384, 512] ++ - [11, 2239.34] ++ - - [384, 512, 1, 128, 416, 416, 384, 512] ++ - [5, 3895.61] ++ - - [384, 512, 1, 256, 416, 416, 384, 512] ++ - [27, 6304.02] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [24, 9187.06] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [27, 11949.5] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [24, 13987.7] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [24, 14798.4] ++ - - [384, 1024, 1, 64, 416, 416, 384, 1024] ++ - [9, 3709.02] ++ - - [384, 1024, 1, 128, 416, 416, 384, 1024] ++ - [5, 6579.24] ++ - - [384, 1024, 1, 256, 416, 416, 384, 1024] ++ - [9, 10616.2] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [3, 15411.9] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [5, 20280.6] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [3, 23975.9] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [19, 25768.0] ++ - - [384, 2048, 1, 64, 416, 416, 384, 2048] ++ - [19, 6402.68] ++ - - [384, 2048, 1, 128, 416, 416, 384, 2048] ++ - [19, 10935.7] ++ - - [384, 2048, 1, 256, 416, 416, 384, 2048] ++ - [11, 18285.7] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [19, 25383.0] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [19, 31209.6] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [11, 33705.2] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [19, 36970.6] ++ - - [384, 4096, 1, 64, 416, 416, 384, 4096] ++ - [15, 12407.6] ++ - - [384, 4096, 1, 128, 416, 416, 384, 4096] ++ - [6, 18847.1] ++ - - [384, 4096, 1, 256, 416, 416, 384, 4096] ++ - [24, 25508.5] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] ++ - [29, 31019.7] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [29, 32931.3] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [27, 35585.6] ++ - - [384, 4096, 1, 4096, 416, 
416, 4128, 4128] ++ - [29, 36275.9] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [6, 500.191] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [7, 904.588] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [9, 1407.95] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [29, 2078.95] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [24, 2727.69] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [23, 3304.43] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [24, 3579.76] ++ - - [768, 128, 1, 64, 800, 800, 768, 128] ++ - [19, 1064.72] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [11, 1926.64] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [24, 3102.28] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [24, 4544.59] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [24, 5996.12] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [24, 7004.09] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [29, 7321.33] ++ - - [768, 256, 1, 64, 800, 800, 768, 256] ++ - [19, 2131.25] ++ - - [768, 256, 1, 128, 800, 800, 768, 256] ++ - [19, 3751.61] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [29, 6078.68] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [24, 9161.14] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [27, 11783.8] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [29, 13862.5] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [27, 14794.9] ++ - - [768, 512, 1, 64, 800, 800, 768, 512] ++ - [3, 3885.4] ++ - - [768, 512, 1, 128, 800, 800, 768, 512] ++ - [5, 6835.71] ++ - - [768, 512, 1, 256, 800, 800, 768, 512] ++ - [5, 11035.2] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [19, 15944.0] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [11, 20272.4] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [19, 24107.3] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [5, 25649.4] ++ - - [768, 1024, 1, 64, 800, 800, 768, 1024] ++ - [15, 6407.57] ++ - - [768, 1024, 1, 128, 800, 800, 768, 1024] ++ - [7, 
10938.0] ++ - - [768, 1024, 1, 256, 800, 800, 768, 1024] ++ - [9, 18267.5] ++ - - [768, 1024, 1, 512, 800, 800, 768, 1024] ++ - [17, 25837.5] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [19, 31605.2] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [5, 33986.9] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [11, 36786.5] ++ - - [768, 2048, 1, 64, 800, 800, 768, 2048] ++ - [16, 12255.0] ++ - - [768, 2048, 1, 128, 800, 800, 768, 2048] ++ - [16, 18944.8] ++ - - [768, 2048, 1, 256, 800, 800, 768, 2048] ++ - [19, 26147.8] ++ - - [768, 2048, 1, 512, 800, 800, 768, 2048] ++ - [5, 31502.6] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [29, 33500.5] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [29, 36337.9] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [29, 37489.7] ++ - - [768, 4096, 1, 64, 800, 800, 768, 4096] ++ - [21, 16901.1] ++ - - [768, 4096, 1, 128, 800, 800, 768, 4096] ++ - [16, 24224.0] ++ - - [768, 4096, 1, 256, 800, 800, 768, 4096] ++ - [16, 30558.3] ++ - - [768, 4096, 1, 512, 800, 800, 768, 4096] ++ - [29, 33524.2] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [29, 36968.0] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [29, 38855.1] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [3, 39145.7] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [5, 959.499] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [1, 1739.4] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [9, 2761.83] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [25, 4153.09] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [24, 5531.84] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [28, 6735.99] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [27, 6695.34] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] ++ - [19, 2156.07] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [11, 3877.6] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [11, 6205.34] ++ - - [1536, 128, 1, 
512, 1568, 1568, 1536, 544] ++ - [24, 9034.54] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [24, 11692.1] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [24, 13805.5] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [23, 12360.0] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] ++ - [29, 3770.15] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] ++ - [3, 6807.05] ++ - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] ++ - [5, 10966.6] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [19, 15902.4] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [5, 20509.9] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [5, 24440.1] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [5, 25719.9] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] ++ - [29, 6648.81] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] ++ - [19, 12339.1] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] ++ - [5, 18775.1] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [5, 25721.9] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] ++ - [17, 31457.1] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [5, 33933.2] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [9, 36789.4] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] ++ - [16, 12281.9] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] ++ - [16, 19119.2] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] ++ - [16, 25930.7] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] ++ - [29, 31323.7] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [29, 33901.0] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [27, 36396.2] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [27, 37584.2] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] ++ - [15, 16976.6] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] ++ - [15, 24325.0] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] ++ - [29, 30859.2] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] ++ - [29, 
33834.1] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] ++ - [29, 37161.2] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [27, 38910.3] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [17, 39133.9] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] ++ - [21, 21167.6] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] ++ - [16, 28283.0] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] ++ - [16, 32543.4] ++ - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] ++ - [29, 36737.0] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] ++ - [29, 39192.2] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [29, 39767.9] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [22, 39218.6] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [18, 1860.54] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [10, 3224.71] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [10, 5030.61] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [10, 7427.89] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [2, 9882.95] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [4, 11801.7] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [2, 12723.2] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] ++ - [5, 4342.66] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [19, 7119.02] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [5, 11401.4] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [5, 16782.7] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [5, 21243.6] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [5, 24716.9] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [11, 25831.7] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] ++ - [5, 7827.6] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] ++ - [11, 12872.5] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [9, 19403.0] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [9, 26224.5] ++ - - [3072, 256, 1, 1024, 3104, 3104, 
3072, 1056] ++ - [5, 31837.7] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [19, 34350.1] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [5, 37051.4] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] ++ - [15, 12697.1] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] ++ - [14, 19472.4] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] ++ - [15, 26275.9] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [9, 31548.3] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [29, 33573.8] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [29, 36265.5] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [10, 36745.4] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] ++ - [20, 16943.8] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] ++ - [15, 24218.2] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] ++ - [16, 30664.2] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] ++ - [29, 33449.0] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [29, 37220.0] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - [29, 38894.3] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [9, 39112.3] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] ++ - [21, 21172.1] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] ++ - [16, 28450.9] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] ++ - [29, 32295.4] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] ++ - [29, 36761.3] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] ++ - [29, 39147.5] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [29, 39866.8] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [19, 39321.9] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] ++ - [12, 24300.0] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] ++ - [15, 29880.3] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] ++ - [15, 35099.8] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] ++ - [29, 38357.6] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] ++ - 
[29, 39665.7] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] ++ - [27, 39551.4] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [29, 39057.0] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [18, 1941.12] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [18, 3898.94] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [8, 6458.35] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [2, 9728.69] ++ - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] ++ - [8, 13058.7] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [10, 15758.3] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [8, 16838.4] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] ++ - [19, 5578.46] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [5, 10244.0] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [19, 16097.0] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [5, 22698.7] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [11, 28528.0] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [5, 32699.0] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [17, 33836.6] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] ++ - [16, 10047.7] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] ++ - [19, 16085.4] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [9, 22574.6] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [5, 29203.0] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [17, 34067.4] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [9, 35986.2] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [17, 38270.9] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] ++ - [16, 14686.2] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] ++ - [7, 21709.2] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] ++ - [19, 28392.2] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [19, 33614.1] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [19, 35320.3] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 
2080] ++ - [9, 37922.5] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [17, 39119.3] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] ++ - [13, 18758.5] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] ++ - [16, 26225.8] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] ++ - [16, 32201.8] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] ++ - [29, 35193.0] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [29, 38060.3] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [29, 39637.9] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [18, 39003.6] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] ++ - [21, 22616.4] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] ++ - [16, 29748.3] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] ++ - [29, 33752.1] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] ++ - [29, 37550.1] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] ++ - [29, 39729.6] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [27, 40134.6] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [29, 39859.9] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] ++ - [21, 23473.8] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] ++ - [19, 30001.5] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] ++ - [29, 34046.1] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] ++ - [29, 38624.9] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] ++ - [16, 39558.4] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] ++ - [27, 39854.3] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [18, 38537.4] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_I8II_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_I8II_BH.yaml +new file mode 100644 +index 00000000..882951ac +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_I8II_BH.yaml +@@ -0,0 +1,22983 @@ ++- 
{MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false 
++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ 
MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ 
StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ 
AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: 
true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ 
Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ 
WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 
16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: 
-1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ 
ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: 
true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: 
Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ 
DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 
32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly 
++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ 
SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' 
++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 
32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: 
Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 
++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ 
SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' 
++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 
0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ 
SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: 
default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 
4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM 
++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ 
AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ 
LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ 
Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ 
WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 
++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: 
false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ 
LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ 
VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] 
++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ 
GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ 
VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] 
++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ 
GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 
++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ 
VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] 
++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ 
GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ 
VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ 
GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 
++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ 
VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] 
++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ 
GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ 
VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ 
NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: 
false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ 
PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ 
StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 
16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true 
++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 
1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ 
LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ 
VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ 
GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 
++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ 
LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ 
VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] 
++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 
1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ 
LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: 
false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ 
PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ 
StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ 
BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ 
MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ 
NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false 
++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 
++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false 
++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ 
VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ 
GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ 
StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ 
BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ 
MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ 
NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: 
false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ 
LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ 
ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ 
UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 
2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ 
PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: 
false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ 
ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 
16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] 
++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: 
false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ 
LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ 
Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ 
UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: 
false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ 
PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ 
StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ 
EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ 
MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true 
++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 
++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ 
IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: 
false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ 
NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false 
++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ 
LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ 
IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 
++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 
++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 79 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 80 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 81 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [3, 34.2627] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [33, 56.9632] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [38, 96.8661] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [66, 147.604] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [49, 210.685] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [77, 263.884] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [60, 294.901] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [5, 60.4576] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [19, 110.121] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [38, 187.765] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [51, 290.083] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [67, 413.477] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [55, 526.708] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [72, 588.406] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [4, 140.053] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [45, 251.668] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [6, 421.92] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [49, 653.981] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [49, 900.114] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [78, 1115.8] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [67, 1218.28] ++ - - [64, 512, 1, 64, 96, 96, 
96, 512] ++ - [24, 303.671] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [45, 541.69] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [48, 907.465] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [48, 1426.15] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [77, 1907.15] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [78, 2284.64] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [77, 2480.46] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [57, 659.378] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [50, 1177.85] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [48, 1952.89] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [77, 2927.71] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [68, 3873.32] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [77, 4632.03] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [48, 4951.87] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [48, 1203.36] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [73, 2170.68] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [61, 3646.03] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [68, 5522.91] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [77, 7434.25] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [48, 9040.37] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [77, 9846.33] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [24, 2065.4] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [6, 3749.1] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [6, 6298.94] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [6, 9532.51] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [37, 12900.6] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [24, 15701.2] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [54, 17171.6] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [29, 62.2448] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [4, 113.176] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [38, 187.413] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - 
[9, 289.783] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [49, 413.313] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [67, 525.372] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [67, 588.355] ++ - - [128, 128, 1, 64, 160, 160, 128, 128] ++ - [12, 160.751] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [29, 290.464] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [27, 472.971] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [40, 689.684] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [62, 947.705] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [49, 1132.03] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [67, 1226.23] ++ - - [128, 256, 1, 64, 160, 160, 128, 256] ++ - [41, 373.624] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [7, 653.93] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [12, 1044.79] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [67, 1531.47] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [49, 1961.56] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [67, 2342.21] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [78, 2514.9] ++ - - [128, 512, 1, 64, 160, 160, 128, 512] ++ - [29, 771.867] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [79, 1348.43] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [0, 2110.61] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [78, 3077.83] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [49, 4005.55] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [78, 4763.89] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [48, 5050.15] ++ - - [128, 1024, 1, 64, 160, 160, 128, 1024] ++ - [76, 1458.63] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [51, 2561.41] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [78, 4183.32] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [49, 6150.57] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [78, 7856.81] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [72, 
9498.45] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [67, 10006.9] ++ - - [128, 2048, 1, 64, 160, 160, 128, 2048] ++ - [55, 2468.69] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [62, 4447.24] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [69, 7475.67] ++ - - [128, 2048, 1, 512, 160, 160, 544, 2048] ++ - [49, 11541.6] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [78, 15324.3] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [67, 18241.7] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [49, 19964.0] ++ - - [128, 4096, 1, 64, 160, 160, 128, 4096] ++ - [36, 4927.95] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [18, 7467.35] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [48, 14361.0] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [78, 22727.6] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [72, 29735.3] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [60, 34877.6] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [72, 35003.8] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [20, 155.322] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [38, 260.225] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [78, 432.313] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [55, 649.223] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [79, 902.196] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [48, 1119.19] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [48, 1206.43] ++ - - [256, 128, 1, 64, 288, 288, 256, 128] ++ - [42, 354.069] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [10, 621.931] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [74, 1039.48] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [61, 1520.78] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [49, 1985.35] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [60, 2329.93] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [77, 2491.88] ++ - - [256, 256, 1, 64, 288, 288, 256, 256] ++ - [22, 
770.87] ++ - - [256, 256, 1, 128, 288, 288, 256, 256] ++ - [23, 1347.57] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [61, 2169.0] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [77, 3158.07] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [66, 4069.42] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [60, 4735.82] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [48, 5039.53] ++ - - [256, 512, 1, 64, 288, 288, 256, 512] ++ - [64, 1438.62] ++ - - [256, 512, 1, 128, 288, 288, 256, 512] ++ - [73, 2554.0] ++ - - [256, 512, 1, 256, 288, 288, 288, 512] ++ - [73, 4167.23] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [59, 6116.93] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [60, 7836.62] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [66, 9469.97] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [60, 9954.22] ++ - - [256, 1024, 1, 64, 288, 288, 256, 1024] ++ - [61, 2458.56] ++ - - [256, 1024, 1, 128, 288, 288, 256, 1024] ++ - [61, 4421.46] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [61, 7397.36] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [60, 11170.8] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [78, 15163.3] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [50, 18143.1] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [77, 19584.5] ++ - - [256, 2048, 1, 64, 288, 288, 256, 2048] ++ - [63, 4585.19] ++ - - [256, 2048, 1, 128, 288, 288, 256, 2048] ++ - [80, 7421.08] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [61, 14439.8] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [48, 21096.8] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [71, 27582.8] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [48, 33168.9] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [59, 35456.9] ++ - - [256, 4096, 1, 64, 288, 288, 256, 4096] ++ - [79, 9041.88] ++ - - [256, 4096, 1, 128, 288, 288, 256, 4096] ++ - [50, 14906.5] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ 
- [73, 21718.1] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [26, 27834.5] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [9, 33051.5] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [8, 35593.3] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [26, 38502.0] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [13, 242.951] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [14, 415.555] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [49, 654.542] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [54, 1028.44] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [66, 1391.84] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [55, 1679.07] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [48, 1821.93] ++ - - [384, 128, 1, 64, 416, 416, 384, 128] ++ - [18, 571.327] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [27, 993.601] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [51, 1605.78] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [49, 2332.11] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [78, 2963.82] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [59, 3521.92] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [59, 3760.87] ++ - - [384, 256, 1, 64, 416, 416, 384, 256] ++ - [74, 1121.87] ++ - - [384, 256, 1, 128, 416, 416, 384, 256] ++ - [2, 1932.27] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [55, 3073.5] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [67, 4538.47] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [49, 5935.69] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [71, 7048.27] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [60, 7494.44] ++ - - [384, 512, 1, 64, 416, 416, 384, 512] ++ - [61, 2006.84] ++ - - [384, 512, 1, 128, 416, 416, 384, 512] ++ - [49, 3571.65] ++ - - [384, 512, 1, 256, 416, 416, 384, 512] ++ - [55, 5917.19] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [49, 8599.29] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [77, 11625.3] ++ - 
- [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [60, 13929.3] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [72, 14765.4] ++ - - [384, 1024, 1, 64, 416, 416, 384, 1024] ++ - [11, 3747.14] ++ - - [384, 1024, 1, 128, 416, 416, 384, 1024] ++ - [73, 6217.64] ++ - - [384, 1024, 1, 256, 416, 416, 384, 1024] ++ - [79, 11614.5] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [44, 16918.2] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [33, 21694.7] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [48, 25704.9] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [59, 27010.1] ++ - - [384, 2048, 1, 64, 416, 416, 384, 2048] ++ - [73, 7976.52] ++ - - [384, 2048, 1, 128, 416, 416, 384, 2048] ++ - [24, 12928.8] ++ - - [384, 2048, 1, 256, 416, 416, 384, 2048] ++ - [38, 19393.8] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [37, 26301.7] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [38, 32067.3] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [40, 34583.3] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [8, 37104.1] ++ - - [384, 4096, 1, 64, 416, 416, 384, 4096] ++ - [81, 12179.5] ++ - - [384, 4096, 1, 128, 416, 416, 384, 4096] ++ - [40, 18852.6] ++ - - [384, 4096, 1, 256, 416, 416, 384, 4096] ++ - [73, 25306.6] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] ++ - [60, 30666.7] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [50, 33311.6] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [48, 35958.0] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [68, 36635.3] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [13, 505.176] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [14, 863.499] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [48, 1380.46] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [48, 2091.06] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [78, 2849.71] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [60, 3433.27] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [59, 
3661.69] ++ - - [768, 128, 1, 64, 800, 800, 768, 128] ++ - [62, 1110.39] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [56, 1951.44] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [62, 3175.1] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [72, 4628.2] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [49, 6002.59] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [72, 7020.5] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [72, 7485.8] ++ - - [768, 256, 1, 64, 800, 800, 768, 256] ++ - [73, 2001.42] ++ - - [768, 256, 1, 128, 800, 800, 768, 256] ++ - [51, 3542.99] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [56, 5872.32] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [50, 8530.79] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [48, 11401.4] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [48, 13724.2] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [55, 15041.8] ++ - - [768, 512, 1, 64, 800, 800, 768, 512] ++ - [2, 3748.82] ++ - - [768, 512, 1, 128, 800, 800, 768, 512] ++ - [39, 6111.92] ++ - - [768, 512, 1, 256, 800, 800, 768, 512] ++ - [24, 10137.3] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [54, 15728.6] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [22, 20967.2] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [49, 25606.8] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [49, 26324.1] ++ - - [768, 1024, 1, 64, 800, 800, 768, 1024] ++ - [77, 7600.7] ++ - - [768, 1024, 1, 128, 800, 800, 768, 1024] ++ - [39, 12523.4] ++ - - [768, 1024, 1, 256, 800, 800, 768, 1024] ++ - [24, 19886.1] ++ - - [768, 1024, 1, 512, 800, 800, 768, 1024] ++ - [24, 26775.8] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [17, 32280.7] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [39, 34310.8] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [26, 37191.5] ++ - - [768, 2048, 1, 64, 800, 800, 768, 2048] ++ - [77, 11795.6] ++ - - [768, 2048, 1, 128, 800, 800, 768, 2048] ++ - [73, 19079.5] 
++ - - [768, 2048, 1, 256, 800, 800, 768, 2048] ++ - [73, 25839.3] ++ - - [768, 2048, 1, 512, 800, 800, 768, 2048] ++ - [59, 31127.8] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [2, 33297.1] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [71, 36502.0] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [71, 38152.0] ++ - - [768, 4096, 1, 64, 800, 800, 768, 4096] ++ - [81, 16746.5] ++ - - [768, 4096, 1, 128, 800, 800, 768, 4096] ++ - [61, 24034.8] ++ - - [768, 4096, 1, 256, 800, 800, 768, 4096] ++ - [71, 30200.9] ++ - - [768, 4096, 1, 512, 800, 800, 768, 4096] ++ - [26, 33339.1] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [26, 36892.0] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [26, 38878.4] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [40, 39582.1] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [2, 947.081] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [50, 1670.37] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [54, 2804.93] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [59, 4234.18] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [77, 5631.51] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [71, 6717.16] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [77, 7358.7] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] ++ - [9, 1962.09] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [10, 3492.83] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [61, 5653.97] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [78, 8494.09] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [48, 11357.7] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [78, 13691.0] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [78, 14899.9] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] ++ - [2, 3742.69] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] ++ - [26, 6331.82] ++ - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] ++ - [39, 10391.6] ++ - - [1536, 
256, 1, 512, 1568, 1568, 1536, 544] ++ - [28, 16107.4] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [48, 21539.2] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [26, 24863.6] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [31, 26156.5] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] ++ - [73, 8018.46] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] ++ - [61, 13105.5] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] ++ - [24, 19872.4] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [37, 26899.2] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] ++ - [8, 31927.5] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [37, 34203.6] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [8, 37215.5] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] ++ - [79, 11781.8] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] ++ - [79, 19117.6] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] ++ - [46, 25847.6] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] ++ - [21, 30898.5] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [34, 33435.4] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [71, 36434.3] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [71, 38210.3] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] ++ - [75, 16816.5] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] ++ - [59, 24024.7] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] ++ - [61, 30174.8] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] ++ - [73, 33507.0] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] ++ - [27, 36943.2] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [27, 38954.3] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [9, 39796.6] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] ++ - [75, 21348.5] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] ++ - [73, 28464.1] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] ++ - [46, 32408.7] ++ - - [1536, 4096, 1, 512, 1568, 
1568, 1536, 4096] ++ - [39, 36787.1] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] ++ - [26, 39067.0] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [71, 39153.4] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [8, 39715.5] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [4, 1768.51] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [45, 3140.23] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [24, 5187.77] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [6, 7705.4] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [37, 10029.2] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [26, 11978.4] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [17, 12881.2] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] ++ - [30, 3765.65] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [18, 6083.11] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [27, 9996.38] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [46, 16366.7] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [8, 21266.2] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [8, 24924.4] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [32, 26196.5] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] ++ - [61, 7920.04] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] ++ - [39, 13041.0] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [25, 19850.8] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [38, 26351.6] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [16, 31923.7] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [37, 34469.3] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [8, 37234.0] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] ++ - [64, 12213.5] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] ++ - [68, 19081.3] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] ++ - [15, 25816.1] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [61, 30738.1] ++ - - [3072, 512, 1, 
1024, 3104, 3104, 3072, 1056] ++ - [79, 33456.2] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [60, 36115.7] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [59, 37885.0] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] ++ - [64, 16731.2] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] ++ - [71, 24119.7] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] ++ - [79, 30248.5] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] ++ - [47, 33607.0] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [8, 36806.5] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - [8, 38860.1] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [9, 39798.1] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] ++ - [75, 21318.0] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] ++ - [77, 28415.9] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] ++ - [39, 32521.9] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] ++ - [26, 36773.7] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] ++ - [40, 39002.4] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [60, 39209.6] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [26, 39950.9] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] ++ - [53, 6862.26] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] ++ - [53, 12876.3] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] ++ - [53, 23076.2] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] ++ - [40, 34836.5] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] ++ - [25, 38469.0] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] ++ - [39, 40068.0] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [26, 40246.6] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [43, 2248.66] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [0, 4086.52] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [77, 6752.09] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [77, 10108.3] ++ - - [4096, 64, 1, 1024, 4128, 4128, 4096, 
1056] ++ - [6, 13353.0] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [6, 15962.6] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [45, 16985.0] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] ++ - [35, 5693.95] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [1, 8636.95] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [26, 14180.4] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [61, 21198.4] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [47, 28723.5] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [8, 33544.0] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [8, 34958.8] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] ++ - [77, 9707.63] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] ++ - [50, 15822.0] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [61, 22295.3] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [25, 28417.9] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [8, 33036.2] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [26, 35523.8] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [9, 38749.7] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] ++ - [70, 13761.7] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] ++ - [50, 20922.5] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] ++ - [61, 27401.2] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [59, 32510.1] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [39, 35154.5] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [8, 37819.2] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [27, 39756.8] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] ++ - [75, 18664.7] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] ++ - [71, 26076.9] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] ++ - [73, 31845.8] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] ++ - [46, 35125.2] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [8, 37834.5] ++ - - [4096, 1024, 1, 
2048, 4128, 4128, 4096, 2080] ++ - [39, 39550.7] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [26, 40028.1] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] ++ - [76, 22394.8] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] ++ - [73, 29770.8] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] ++ - [46, 33621.7] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] ++ - [46, 37010.3] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] ++ - [47, 38908.1] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [26, 39496.0] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [8, 39951.5] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] ++ - [58, 7088.68] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] ++ - [52, 13399.6] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] ++ - [65, 24382.5] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] ++ - [38, 36186.0] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] ++ - [40, 38939.5] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] ++ - [46, 39122.0] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [26, 40501.1] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_I8II_BH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_I8II_BH_GB.yaml +new file mode 100644 +index 00000000..0dc8fab9 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_I8II_BH_GB.yaml +@@ -0,0 +1,22983 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] 
++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: 
Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ 
DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 
32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: 
Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 
++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ 
SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' 
++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 
32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly 
++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ 
SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' 
++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ 
DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 
++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ 
DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 
++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ 
InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] 
++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ 
LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ 
VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] 
++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ 
NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 
++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ 
GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ 
ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 
++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ 
VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ 
GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 
0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ 
BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ 
MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ 
NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: 
false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 
4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: 
false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: 
true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: 
false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ 
StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ 
BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ 
MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ 
NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: 
false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 
++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false 
++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ 
VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ 
GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ 
StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ 
BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ 
MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ 
NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: 
false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 
4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: 
false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: 
true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false 
++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 
0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 
++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8192 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ 
BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ 
MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ 
NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ 
AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ 
LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ 
ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ 
UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 
4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4096 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 4096 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 6144 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ 
PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: 
false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: 
false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] 
++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] 
++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: 
false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ 
LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ 
ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ 
UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ 
GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ 
PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ 
StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ 
ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ 
MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ 
UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 
1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ 
LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true 
++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ 
StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: 
ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 
16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ 
TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 
++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ 
IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer 
++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false 
++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ 
LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ 
IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 
0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 
++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} 
++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false 
++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 
3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 
++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ 
StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ 
AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ 
LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] 
++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 
++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 
32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true 
++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ 
NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: 
Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ 
DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 79 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false 
++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 80 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 16384 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 12288 ++ LdsPadA: 0 
++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 81 ++ SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: false ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ 
VectorWidth: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [3, 34.2627] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [33, 56.9632] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [38, 96.8661] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [66, 147.604] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [49, 210.685] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [77, 263.884] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [60, 294.901] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [5, 60.4576] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [19, 110.121] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [38, 187.765] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [51, 290.083] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [67, 413.477] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [55, 526.708] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [72, 588.406] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [4, 140.053] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [45, 251.668] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [6, 421.92] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [49, 653.981] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [49, 900.114] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [78, 1115.8] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [67, 1218.28] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [24, 303.671] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [45, 541.69] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [48, 907.465] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [48, 1426.15] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [77, 1907.15] ++ - - [64, 
512, 1, 2048, 96, 96, 2080, 2080] ++ - [78, 2284.64] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [77, 2480.46] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [57, 659.378] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [50, 1177.85] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [48, 1952.89] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [77, 2927.71] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [68, 3873.32] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [77, 4632.03] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [48, 4951.87] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [48, 1203.36] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [73, 2170.68] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [61, 3646.03] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [68, 5522.91] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [77, 7434.25] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [48, 9040.37] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [77, 9846.33] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [24, 2065.4] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [6, 3749.1] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [6, 6298.94] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [6, 9532.51] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [37, 12900.6] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [24, 15701.2] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [54, 17171.6] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [29, 62.2448] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [4, 113.176] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [38, 187.413] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [9, 289.783] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [49, 413.313] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [67, 525.372] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [67, 588.355] ++ - - [128, 128, 1, 64, 160, 160, 128, 128] ++ - [12, 160.751] ++ - - [128, 
128, 1, 128, 160, 160, 160, 160] ++ - [29, 290.464] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [27, 472.971] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [40, 689.684] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [62, 947.705] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [49, 1132.03] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [67, 1226.23] ++ - - [128, 256, 1, 64, 160, 160, 128, 256] ++ - [41, 373.624] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [7, 653.93] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [12, 1044.79] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [67, 1531.47] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [49, 1961.56] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [67, 2342.21] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [78, 2514.9] ++ - - [128, 512, 1, 64, 160, 160, 128, 512] ++ - [29, 771.867] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [79, 1348.43] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [0, 2110.61] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [78, 3077.83] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [49, 4005.55] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [78, 4763.89] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [48, 5050.15] ++ - - [128, 1024, 1, 64, 160, 160, 128, 1024] ++ - [76, 1458.63] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [51, 2561.41] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [78, 4183.32] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [49, 6150.57] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [78, 7856.81] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [72, 9498.45] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [67, 10006.9] ++ - - [128, 2048, 1, 64, 160, 160, 128, 2048] ++ - [55, 2468.69] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [62, 4447.24] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [69, 7475.67] ++ - - [128, 
2048, 1, 512, 160, 160, 544, 2048] ++ - [49, 11541.6] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [78, 15324.3] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [67, 18241.7] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [49, 19964.0] ++ - - [128, 4096, 1, 64, 160, 160, 128, 4096] ++ - [36, 4927.95] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [18, 7467.35] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [48, 14361.0] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [78, 22727.6] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [72, 29735.3] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [60, 34877.6] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [72, 35003.8] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [20, 155.322] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [38, 260.225] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [78, 432.313] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [55, 649.223] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [79, 902.196] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [48, 1119.19] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [48, 1206.43] ++ - - [256, 128, 1, 64, 288, 288, 256, 128] ++ - [42, 354.069] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [10, 621.931] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [74, 1039.48] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [61, 1520.78] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [49, 1985.35] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [60, 2329.93] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [77, 2491.88] ++ - - [256, 256, 1, 64, 288, 288, 256, 256] ++ - [22, 770.87] ++ - - [256, 256, 1, 128, 288, 288, 256, 256] ++ - [23, 1347.57] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [61, 2169.0] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [77, 3158.07] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [66, 4069.42] ++ - - [256, 256, 1, 
2048, 288, 288, 2080, 2080] ++ - [60, 4735.82] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [48, 5039.53] ++ - - [256, 512, 1, 64, 288, 288, 256, 512] ++ - [64, 1438.62] ++ - - [256, 512, 1, 128, 288, 288, 256, 512] ++ - [73, 2554.0] ++ - - [256, 512, 1, 256, 288, 288, 288, 512] ++ - [73, 4167.23] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [59, 6116.93] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [60, 7836.62] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [66, 9469.97] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [60, 9954.22] ++ - - [256, 1024, 1, 64, 288, 288, 256, 1024] ++ - [61, 2458.56] ++ - - [256, 1024, 1, 128, 288, 288, 256, 1024] ++ - [61, 4421.46] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [61, 7397.36] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [60, 11170.8] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [78, 15163.3] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [50, 18143.1] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [77, 19584.5] ++ - - [256, 2048, 1, 64, 288, 288, 256, 2048] ++ - [63, 4585.19] ++ - - [256, 2048, 1, 128, 288, 288, 256, 2048] ++ - [80, 7421.08] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [61, 14439.8] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [48, 21096.8] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [71, 27582.8] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [48, 33168.9] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [59, 35456.9] ++ - - [256, 4096, 1, 64, 288, 288, 256, 4096] ++ - [79, 9041.88] ++ - - [256, 4096, 1, 128, 288, 288, 256, 4096] ++ - [50, 14906.5] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [73, 21718.1] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [26, 27834.5] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [9, 33051.5] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [8, 35593.3] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [26, 38502.0] ++ 
- - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [13, 242.951] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [14, 415.555] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [49, 654.542] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [54, 1028.44] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [66, 1391.84] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [55, 1679.07] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [48, 1821.93] ++ - - [384, 128, 1, 64, 416, 416, 384, 128] ++ - [18, 571.327] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [27, 993.601] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [51, 1605.78] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [49, 2332.11] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [78, 2963.82] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [59, 3521.92] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [59, 3760.87] ++ - - [384, 256, 1, 64, 416, 416, 384, 256] ++ - [74, 1121.87] ++ - - [384, 256, 1, 128, 416, 416, 384, 256] ++ - [2, 1932.27] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [55, 3073.5] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [67, 4538.47] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [49, 5935.69] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [71, 7048.27] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [60, 7494.44] ++ - - [384, 512, 1, 64, 416, 416, 384, 512] ++ - [61, 2006.84] ++ - - [384, 512, 1, 128, 416, 416, 384, 512] ++ - [49, 3571.65] ++ - - [384, 512, 1, 256, 416, 416, 384, 512] ++ - [55, 5917.19] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [49, 8599.29] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [77, 11625.3] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [60, 13929.3] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [72, 14765.4] ++ - - [384, 1024, 1, 64, 416, 416, 384, 1024] ++ - [11, 3747.14] ++ - - [384, 1024, 1, 128, 416, 416, 384, 1024] ++ - [73, 6217.64] ++ - - [384, 1024, 1, 256, 
416, 416, 384, 1024] ++ - [79, 11614.5] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [44, 16918.2] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [33, 21694.7] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [48, 25704.9] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [59, 27010.1] ++ - - [384, 2048, 1, 64, 416, 416, 384, 2048] ++ - [73, 7976.52] ++ - - [384, 2048, 1, 128, 416, 416, 384, 2048] ++ - [24, 12928.8] ++ - - [384, 2048, 1, 256, 416, 416, 384, 2048] ++ - [38, 19393.8] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [37, 26301.7] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [38, 32067.3] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [40, 34583.3] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [8, 37104.1] ++ - - [384, 4096, 1, 64, 416, 416, 384, 4096] ++ - [81, 12179.5] ++ - - [384, 4096, 1, 128, 416, 416, 384, 4096] ++ - [40, 18852.6] ++ - - [384, 4096, 1, 256, 416, 416, 384, 4096] ++ - [73, 25306.6] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] ++ - [60, 30666.7] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [50, 33311.6] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [48, 35958.0] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [68, 36635.3] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [13, 505.176] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [14, 863.499] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [48, 1380.46] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [48, 2091.06] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [78, 2849.71] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [60, 3433.27] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [59, 3661.69] ++ - - [768, 128, 1, 64, 800, 800, 768, 128] ++ - [62, 1110.39] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [56, 1951.44] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [62, 3175.1] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [72, 4628.2] ++ - - [768, 128, 1, 1024, 
800, 800, 1056, 1056] ++ - [49, 6002.59] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [72, 7020.5] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [72, 7485.8] ++ - - [768, 256, 1, 64, 800, 800, 768, 256] ++ - [73, 2001.42] ++ - - [768, 256, 1, 128, 800, 800, 768, 256] ++ - [51, 3542.99] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [56, 5872.32] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [50, 8530.79] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [48, 11401.4] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [48, 13724.2] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [55, 15041.8] ++ - - [768, 512, 1, 64, 800, 800, 768, 512] ++ - [2, 3748.82] ++ - - [768, 512, 1, 128, 800, 800, 768, 512] ++ - [39, 6111.92] ++ - - [768, 512, 1, 256, 800, 800, 768, 512] ++ - [24, 10137.3] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [54, 15728.6] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [22, 20967.2] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [49, 25606.8] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [49, 26324.1] ++ - - [768, 1024, 1, 64, 800, 800, 768, 1024] ++ - [77, 7600.7] ++ - - [768, 1024, 1, 128, 800, 800, 768, 1024] ++ - [39, 12523.4] ++ - - [768, 1024, 1, 256, 800, 800, 768, 1024] ++ - [24, 19886.1] ++ - - [768, 1024, 1, 512, 800, 800, 768, 1024] ++ - [24, 26775.8] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [17, 32280.7] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [39, 34310.8] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [26, 37191.5] ++ - - [768, 2048, 1, 64, 800, 800, 768, 2048] ++ - [77, 11795.6] ++ - - [768, 2048, 1, 128, 800, 800, 768, 2048] ++ - [73, 19079.5] ++ - - [768, 2048, 1, 256, 800, 800, 768, 2048] ++ - [73, 25839.3] ++ - - [768, 2048, 1, 512, 800, 800, 768, 2048] ++ - [59, 31127.8] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [2, 33297.1] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [71, 36502.0] ++ - - [768, 2048, 1, 
4096, 800, 800, 4128, 4128] ++ - [71, 38152.0] ++ - - [768, 4096, 1, 64, 800, 800, 768, 4096] ++ - [81, 16746.5] ++ - - [768, 4096, 1, 128, 800, 800, 768, 4096] ++ - [61, 24034.8] ++ - - [768, 4096, 1, 256, 800, 800, 768, 4096] ++ - [71, 30200.9] ++ - - [768, 4096, 1, 512, 800, 800, 768, 4096] ++ - [26, 33339.1] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [26, 36892.0] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [26, 38878.4] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [40, 39582.1] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [2, 947.081] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [50, 1670.37] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [54, 2804.93] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [59, 4234.18] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [77, 5631.51] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [71, 6717.16] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [77, 7358.7] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] ++ - [9, 1962.09] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [10, 3492.83] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [61, 5653.97] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [78, 8494.09] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [48, 11357.7] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [78, 13691.0] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [78, 14899.9] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] ++ - [2, 3742.69] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] ++ - [26, 6331.82] ++ - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] ++ - [39, 10391.6] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [28, 16107.4] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [48, 21539.2] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [26, 24863.6] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [31, 26156.5] ++ - - [1536, 512, 1, 64, 
1568, 1568, 1536, 512] ++ - [73, 8018.46] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] ++ - [61, 13105.5] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] ++ - [24, 19872.4] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [37, 26899.2] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] ++ - [8, 31927.5] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [37, 34203.6] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [8, 37215.5] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] ++ - [79, 11781.8] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] ++ - [79, 19117.6] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] ++ - [46, 25847.6] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] ++ - [21, 30898.5] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [34, 33435.4] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [71, 36434.3] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [71, 38210.3] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] ++ - [75, 16816.5] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] ++ - [59, 24024.7] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] ++ - [61, 30174.8] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] ++ - [73, 33507.0] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] ++ - [27, 36943.2] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [27, 38954.3] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [9, 39796.6] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] ++ - [75, 21348.5] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] ++ - [73, 28464.1] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] ++ - [46, 32408.7] ++ - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] ++ - [39, 36787.1] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] ++ - [26, 39067.0] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [71, 39153.4] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [8, 39715.5] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 
96] ++ - [4, 1768.51] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [45, 3140.23] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [24, 5187.77] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [6, 7705.4] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [37, 10029.2] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [26, 11978.4] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [17, 12881.2] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] ++ - [30, 3765.65] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [18, 6083.11] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [27, 9996.38] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [46, 16366.7] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [8, 21266.2] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [8, 24924.4] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [32, 26196.5] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] ++ - [61, 7920.04] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] ++ - [39, 13041.0] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [25, 19850.8] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [38, 26351.6] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [16, 31923.7] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [37, 34469.3] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [8, 37234.0] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] ++ - [64, 12213.5] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] ++ - [68, 19081.3] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] ++ - [15, 25816.1] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [61, 30738.1] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [79, 33456.2] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [60, 36115.7] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [59, 37885.0] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] ++ - [64, 16731.2] ++ - - [3072, 1024, 1, 128, 3104, 
3104, 3072, 1024] ++ - [71, 24119.7] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] ++ - [79, 30248.5] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] ++ - [47, 33607.0] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [8, 36806.5] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - [8, 38860.1] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [9, 39798.1] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] ++ - [75, 21318.0] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] ++ - [77, 28415.9] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] ++ - [39, 32521.9] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] ++ - [26, 36773.7] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] ++ - [40, 39002.4] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [60, 39209.6] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [26, 39950.9] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] ++ - [53, 6862.26] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] ++ - [53, 12876.3] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] ++ - [53, 23076.2] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] ++ - [40, 34836.5] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] ++ - [25, 38469.0] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] ++ - [39, 40068.0] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [26, 40246.6] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [43, 2248.66] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [0, 4086.52] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [77, 6752.09] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [77, 10108.3] ++ - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] ++ - [6, 13353.0] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [6, 15962.6] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [45, 16985.0] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] ++ - [35, 5693.95] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [1, 
8636.95] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [26, 14180.4] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [61, 21198.4] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [47, 28723.5] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [8, 33544.0] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [8, 34958.8] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] ++ - [77, 9707.63] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] ++ - [50, 15822.0] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [61, 22295.3] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [25, 28417.9] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [8, 33036.2] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [26, 35523.8] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [9, 38749.7] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] ++ - [70, 13761.7] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] ++ - [50, 20922.5] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] ++ - [61, 27401.2] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [59, 32510.1] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [39, 35154.5] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [8, 37819.2] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [27, 39756.8] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] ++ - [75, 18664.7] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] ++ - [71, 26076.9] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] ++ - [73, 31845.8] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] ++ - [46, 35125.2] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [8, 37834.5] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [39, 39550.7] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [26, 40028.1] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] ++ - [76, 22394.8] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] ++ - [73, 29770.8] ++ - - [4096, 2048, 1, 256, 
4128, 4128, 4096, 2048] ++ - [46, 33621.7] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] ++ - [46, 37010.3] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] ++ - [47, 38908.1] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [26, 39496.0] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [8, 39951.5] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] ++ - [58, 7088.68] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] ++ - [52, 13399.6] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] ++ - [65, 24382.5] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] ++ - [38, 36186.0] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] ++ - [40, 38939.5] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] ++ - [46, 39122.0] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [26, 40501.1] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_SB.yaml +new file mode 100644 +index 00000000..a8cf5304 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bjlk_SB.yaml +@@ -0,0 +1,310 @@ ++- {MinimumRequiredVersion: 4.33.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 0 ++ DestDataType: 0 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ 
OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 8 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: false ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 1 ++ GlobalLoadVectorWidthB: 1 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 1 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: 
false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 8 ++ LVPB: 8 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 512 ++ LdsOffsetA: 0 ++ LdsOffsetB: 256 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 1 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 8 ++ LoopTail: true ++ LoopUnroll: 8 ++ MACInstruction: FMA ++ MIArchVgpr: false ++ MacroTile0: 32 ++ MacroTile1: 32 ++ MacroTileA: 32 ++ MacroTileB: 32 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstruction: [] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 4 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: false ++ PrefetchLocalRead: true ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: 
true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 0 ++ DestDataType: 0 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_ ++ SourceSwap: false ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [2, 2] ++ ThreadTile0: 2 ++ ThreadTile1: 2 ++ ThreadTileA: 2 ++ ThreadTileB: 2 ++ TransposeLDS: 0 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: 0 ++ UnrollMajorLDSB: 0 ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 
++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 8 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [126, 126, 2, 66, 126, 126, 126, 126] ++ - [0, 0] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_BBS_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_BBS_BH.yaml +new file mode 100644 +index 00000000..2075c6b3 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_BBS_BH.yaml +@@ -0,0 +1,14343 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ 
AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 
++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ 
AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 
1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: 
false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ 
PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ 
StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 
true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ 
MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: 
true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: 
[16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: 
[] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ 
AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 
++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ 
ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ 
Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: 
false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ 
StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ 
BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ 
MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ 
NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ 
VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 
1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ 
VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 
++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ 
VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ 
VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false 
++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 
++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 
++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ 
NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ 
LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ 
ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ 
WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: 
[11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ 
PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true 
++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ 
LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ 
ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ 
WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: 
[11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ 
PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true 
++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ 
LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ 
ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ 
WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 
0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ 
PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ 
LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ 
ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ 
WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: 
[11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ 
PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true 
++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ 
LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ 
ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ 
WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: 
[11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ 
PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true 
++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ 
LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ 
ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ 
WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: 
[11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ 
PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true 
++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ 
LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ 
ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ 
WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: 
[11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ 
PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [2, 31.5455] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [18, 58.1573] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [2, 92.6138] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [41, 148.408] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [49, 214.872] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [41, 275.406] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [41, 308.15] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [5, 55.8823] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [31, 103.635] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [22, 179.998] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [41, 290.042] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [41, 422.579] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [41, 545.885] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [41, 615.982] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [5, 128.96] ++ - - [64, 256, 1, 128, 96, 96, 160, 160] ++ - [21, 235.053] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [22, 405.128] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [41, 646.222] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] 
++ - [44, 910.517] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [49, 1146.1] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [44, 1274.98] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [31, 288.268] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [34, 526.46] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [44, 896.506] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [41, 1400.79] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [39, 1890.07] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [44, 2351.15] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [44, 2575.12] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [34, 571.429] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [41, 1047.92] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [39, 1793.01] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [41, 2791.32] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [42, 3852.4] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [39, 4840.68] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [41, 5274.3] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [9, 1141.0] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [41, 2085.94] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [44, 3573.04] ++ - - [64, 2048, 1, 512, 96, 96, 544, 544] ++ - [41, 5542.98] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [44, 7771.73] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [49, 9778.7] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [46, 10868.9] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [34, 2055.78] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [33, 3798.32] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [33, 6475.81] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [33, 10103.7] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [7, 14055.7] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [33, 17325.1] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [34, 18171.0] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [5, 63.5655] ++ - - [128, 64, 
1, 128, 160, 160, 160, 160] ++ - [5, 118.698] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [2, 188.677] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [49, 297.025] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [43, 425.537] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [49, 549.424] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [41, 614.55] ++ - - [128, 128, 1, 64, 160, 160, 128, 96] ++ - [9, 138.958] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [9, 263.793] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [9, 450.129] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [49, 686.129] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [41, 945.993] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [48, 1168.41] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [49, 1286.4] ++ - - [128, 256, 1, 64, 160, 160, 128, 96] ++ - [22, 340.501] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [9, 610.791] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [9, 1016.19] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [41, 1489.98] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [49, 2000.02] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [43, 2422.62] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [49, 2606.32] ++ - - [128, 512, 1, 64, 160, 160, 128, 96] ++ - [28, 726.035] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [9, 1289.36] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [44, 2142.41] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [44, 3157.77] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [41, 4194.3] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [39, 5001.78] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [49, 5274.29] ++ - - [128, 1024, 1, 64, 160, 160, 128, 96] ++ - [44, 1468.59] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [42, 2612.86] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [41, 4346.43] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - 
[47, 6454.02] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [41, 8498.02] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [49, 10038.3] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [40, 10967.3] ++ - - [128, 2048, 1, 64, 160, 160, 128, 96] ++ - [44, 2697.74] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [41, 4830.74] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [44, 8108.85] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [47, 12237.2] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [41, 16675.1] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [41, 20214.3] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [38, 19144.9] ++ - - [128, 4096, 1, 64, 160, 160, 128, 96] ++ - [45, 4424.36] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [32, 8058.22] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [21, 13687.3] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [33, 21218.5] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [9, 29324.4] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [33, 36226.1] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [22, 34507.1] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [14, 141.604] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [5, 263.394] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [1, 421.537] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [41, 643.445] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [41, 904.968] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [44, 1138.48] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [41, 1262.65] ++ - - [256, 128, 1, 64, 288, 288, 256, 96] ++ - [9, 322.589] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [22, 613.023] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [41, 1013.61] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [8, 1491.04] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [48, 1980.55] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [44, 
2414.42] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [49, 2587.13] ++ - - [256, 256, 1, 64, 288, 288, 256, 96] ++ - [28, 687.929] ++ - - [256, 256, 1, 128, 288, 288, 256, 160] ++ - [9, 1236.71] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [41, 2041.52] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [44, 3068.25] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [49, 4176.55] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [49, 4988.02] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [44, 5248.0] ++ - - [256, 512, 1, 64, 288, 288, 256, 96] ++ - [22, 1408.43] ++ - - [256, 512, 1, 128, 288, 288, 256, 160] ++ - [41, 2521.37] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [44, 4170.32] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [49, 6216.09] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [44, 8264.64] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [49, 9917.08] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [49, 10609.7] ++ - - [256, 1024, 1, 64, 288, 288, 256, 96] ++ - [49, 2777.22] ++ - - [256, 1024, 1, 128, 288, 288, 256, 160] ++ - [49, 4954.88] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [49, 8125.54] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [44, 12225.0] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [41, 16373.0] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [49, 19692.3] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [48, 21657.6] ++ - - [256, 2048, 1, 64, 288, 288, 256, 96] ++ - [33, 4637.79] ++ - - [256, 2048, 1, 128, 288, 288, 256, 160] ++ - [21, 8376.04] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [21, 13583.4] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [21, 20875.3] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [9, 28780.5] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [33, 35400.8] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [21, 38470.2] ++ - - [256, 4096, 1, 64, 288, 288, 256, 96] ++ - [49, 7849.91] 
++ - - [256, 4096, 1, 128, 288, 288, 256, 160] ++ - [3, 14079.3] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [23, 20854.2] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [27, 27932.9] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [18, 33924.4] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [28, 36534.2] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [26, 36580.9] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [5, 218.271] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [22, 404.231] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [5, 657.346] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [38, 983.424] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [43, 1377.14] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [44, 1729.85] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [44, 1904.52] ++ - - [384, 128, 1, 64, 416, 416, 384, 96] ++ - [5, 496.172] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [9, 943.954] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [9, 1554.6] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [41, 2252.18] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [44, 3023.83] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [41, 3653.97] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [49, 3913.43] ++ - - [384, 256, 1, 64, 416, 416, 384, 96] ++ - [9, 1050.15] ++ - - [384, 256, 1, 128, 416, 416, 384, 160] ++ - [44, 1875.25] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [47, 3101.14] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [44, 4620.97] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [49, 6264.82] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [41, 7468.99] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [41, 7863.55] ++ - - [384, 512, 1, 64, 416, 416, 384, 96] ++ - [49, 2025.57] ++ - - [384, 512, 1, 128, 416, 416, 384, 160] ++ - [22, 3645.09] ++ - - [384, 512, 1, 256, 416, 416, 384, 288] ++ - [41, 6053.11] ++ - - [384, 512, 1, 512, 416, 
416, 544, 544] ++ - [39, 9035.39] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [41, 12173.6] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [40, 14626.0] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [41, 15811.7] ++ - - [384, 1024, 1, 64, 416, 416, 384, 96] ++ - [21, 3646.69] ++ - - [384, 1024, 1, 128, 416, 416, 384, 160] ++ - [33, 6572.41] ++ - - [384, 1024, 1, 256, 416, 416, 384, 288] ++ - [21, 10867.2] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [33, 16249.1] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [8, 21993.3] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [21, 26658.7] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [33, 29005.9] ++ - - [384, 2048, 1, 64, 416, 416, 384, 96] ++ - [12, 6183.99] ++ - - [384, 2048, 1, 128, 416, 416, 384, 160] ++ - [28, 10738.6] ++ - - [384, 2048, 1, 256, 416, 416, 384, 288] ++ - [2, 18176.8] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [1, 25557.1] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [4, 30740.4] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [24, 32902.6] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [35, 35810.1] ++ - - [384, 4096, 1, 64, 416, 416, 384, 96] ++ - [28, 10343.5] ++ - - [384, 4096, 1, 128, 416, 416, 384, 160] ++ - [31, 16763.2] ++ - - [384, 4096, 1, 256, 416, 416, 384, 288] ++ - [27, 23679.9] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [31, 30242.8] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [28, 34131.8] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [2, 37668.5] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [28, 37185.0] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [2, 456.432] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [44, 815.695] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [22, 1358.85] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [41, 2109.63] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [49, 2877.73] ++ - - [768, 64, 1, 2048, 800, 800, 
2080, 2080] ++ - [44, 3511.47] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [41, 3828.74] ++ - - [768, 128, 1, 64, 800, 800, 768, 96] ++ - [7, 1033.08] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [7, 1967.92] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [41, 3228.87] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [41, 4624.37] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [47, 6076.87] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [44, 7326.56] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [49, 7814.26] ++ - - [768, 256, 1, 64, 800, 800, 768, 96] ++ - [22, 2032.12] ++ - - [768, 256, 1, 128, 800, 800, 768, 160] ++ - [49, 3641.94] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [49, 6055.28] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [41, 9016.78] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [41, 12240.2] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [41, 14694.3] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [41, 15627.0] ++ - - [768, 512, 1, 64, 800, 800, 768, 96] ++ - [21, 3648.28] ++ - - [768, 512, 1, 128, 800, 800, 768, 160] ++ - [8, 6605.18] ++ - - [768, 512, 1, 256, 800, 800, 768, 288] ++ - [6, 10909.6] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [8, 16386.7] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [33, 21931.0] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [8, 26898.2] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [33, 29091.3] ++ - - [768, 1024, 1, 64, 800, 800, 768, 96] ++ - [3, 6200.01] ++ - - [768, 1024, 1, 128, 800, 800, 768, 160] ++ - [5, 10852.0] ++ - - [768, 1024, 1, 256, 800, 800, 768, 288] ++ - [2, 18448.3] ++ - - [768, 1024, 1, 512, 800, 800, 768, 544] ++ - [5, 25720.4] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [31, 30377.5] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [10, 32716.7] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [11, 35693.8] ++ - - [768, 2048, 1, 64, 800, 800, 768, 96] ++ - [14, 
10831.0] ++ - - [768, 2048, 1, 128, 800, 800, 768, 160] ++ - [14, 17308.0] ++ - - [768, 2048, 1, 256, 800, 800, 768, 288] ++ - [18, 24651.2] ++ - - [768, 2048, 1, 512, 800, 800, 768, 544] ++ - [2, 31447.4] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [31, 34654.7] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [18, 38424.7] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [28, 40266.6] ++ - - [768, 4096, 1, 64, 800, 800, 768, 96] ++ - [28, 13611.4] ++ - - [768, 4096, 1, 128, 800, 800, 768, 160] ++ - [18, 20825.1] ++ - - [768, 4096, 1, 256, 800, 800, 768, 288] ++ - [5, 28104.5] ++ - - [768, 4096, 1, 512, 800, 800, 768, 544] ++ - [28, 33066.7] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [2, 37666.3] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [18, 40570.6] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [14, 40231.5] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [2, 966.578] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [0, 1768.5] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [20, 2846.17] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [44, 4206.92] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [49, 5748.9] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [49, 7017.79] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [49, 7271.78] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] ++ - [7, 1920.18] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [9, 3648.81] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [49, 6017.65] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [49, 9205.61] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [44, 12195.0] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [49, 14671.3] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [44, 13850.9] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] ++ - [21, 3677.6] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] ++ - [21, 6618.23] ++ - - [1536, 256, 1, 
256, 1568, 1568, 1536, 288] ++ - [33, 11015.9] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [8, 16499.4] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [9, 22029.4] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [8, 26694.9] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [9, 28943.4] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] ++ - [24, 5967.0] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] ++ - [5, 10440.1] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] ++ - [5, 17975.6] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [1, 25368.8] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] ++ - [30, 30814.5] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [10, 32993.5] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [26, 35643.3] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] ++ - [30, 10536.2] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] ++ - [30, 16841.8] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] ++ - [28, 24297.2] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] ++ - [18, 31188.0] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [28, 34977.6] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [16, 38439.9] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [17, 39310.3] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] ++ - [31, 13682.7] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] ++ - [28, 20853.1] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] ++ - [14, 28171.3] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] ++ - [14, 33011.1] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] ++ - [31, 37838.4] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [14, 40624.1] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [13, 40569.6] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] ++ - [31, 16107.4] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] ++ - [28, 23625.0] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] ++ - [14, 
29528.1] ++ - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] ++ - [13, 35602.3] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] ++ - [28, 39645.1] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [28, 41199.1] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [18, 40470.4] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [19, 1581.17] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [21, 3002.72] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [21, 5094.8] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [9, 7754.66] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [33, 10734.0] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [8, 13216.0] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [21, 14478.2] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] ++ - [8, 4118.79] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [21, 6933.67] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [8, 11427.3] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [21, 17050.0] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [33, 23033.8] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [8, 27546.0] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [8, 29441.2] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] ++ - [22, 7468.71] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] ++ - [31, 12682.8] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [5, 19388.2] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [5, 25789.6] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [30, 30932.9] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [37, 32913.3] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [10, 36545.9] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] ++ - [30, 10490.1] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] ++ - [14, 16913.9] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] ++ - [28, 24281.1] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - 
[18, 31168.7] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [14, 34598.1] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [16, 38508.8] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [2, 37806.2] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] ++ - [28, 13658.5] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] ++ - [31, 20927.9] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] ++ - [31, 28140.8] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] ++ - [14, 32921.4] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [31, 37670.7] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - [13, 40662.0] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [28, 39736.3] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] ++ - [31, 16083.6] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] ++ - [18, 23636.1] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] ++ - [2, 29682.1] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] ++ - [17, 35629.9] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] ++ - [14, 39631.9] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [14, 41348.8] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [31, 40294.3] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] ++ - [31, 16959.2] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] ++ - [31, 24553.1] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] ++ - [5, 31740.6] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] ++ - [17, 37183.5] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] ++ - [31, 39984.0] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] ++ - [14, 41333.5] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [23, 38372.4] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [21, 2211.02] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [32, 4254.94] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [19, 7136.96] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [21, 10914.7] ++ - - [4096, 64, 1, 
1024, 4128, 4128, 4096, 1056] ++ - [21, 14685.5] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [8, 17868.9] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [9, 18919.9] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] ++ - [19, 5265.92] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [21, 9489.38] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [33, 15606.7] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [33, 23017.9] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [22, 30352.3] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [8, 36329.1] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [21, 37923.3] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] ++ - [33, 8889.77] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] ++ - [18, 14731.4] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [28, 21226.9] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [14, 28508.4] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [28, 34183.6] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [29, 36300.2] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [11, 37577.9] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] ++ - [36, 11821.2] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] ++ - [31, 18642.6] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] ++ - [31, 25960.9] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [18, 32563.3] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [31, 35803.3] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [31, 39464.9] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [25, 39013.4] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] ++ - [30, 14741.9] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] ++ - [28, 22210.4] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] ++ - [14, 29142.9] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] ++ - [18, 34197.8] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [18, 38624.5] ++ 
- - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [18, 41330.2] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [30, 40005.7] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] ++ - [14, 16793.0] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] ++ - [14, 23383.9] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] ++ - [2, 30709.5] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] ++ - [13, 36397.4] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] ++ - [28, 40230.9] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [13, 41088.7] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [28, 40108.3] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] ++ - [34, 17263.3] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] ++ - [14, 24821.2] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] ++ - [5, 31872.9] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] ++ - [17, 37287.4] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] ++ - [13, 39593.2] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] ++ - [15, 41369.1] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [17, 36883.9] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_BBS_BH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_BBS_BH_GB.yaml +new file mode 100644 +index 00000000..cdd9bc04 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_BBS_BH_GB.yaml +@@ -0,0 +1,14343 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 
1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: 
Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ 
DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ 
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ 
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ 
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ 
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ 
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ 
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ 
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ 
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ 
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ 
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ 
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ 
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [2, 31.5455] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [18, 58.1573] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [2, 92.6138] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [41, 
148.408] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [49, 214.872] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [41, 275.406] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [41, 308.15] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [5, 55.8823] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [31, 103.635] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [22, 179.998] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [41, 290.042] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [41, 422.579] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [41, 545.885] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [41, 615.982] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [5, 128.96] ++ - - [64, 256, 1, 128, 96, 96, 160, 160] ++ - [21, 235.053] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [22, 405.128] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [41, 646.222] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [44, 910.517] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [49, 1146.1] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [44, 1274.98] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [31, 288.268] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [34, 526.46] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [44, 896.506] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [41, 1400.79] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [39, 1890.07] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [44, 2351.15] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [44, 2575.12] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [34, 571.429] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [41, 1047.92] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [39, 1793.01] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [41, 2791.32] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [42, 3852.4] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [39, 4840.68] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [41, 5274.3] ++ - - [64, 2048, 1, 64, 96, 96, 96, 
96] ++ - [9, 1141.0] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [41, 2085.94] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [44, 3573.04] ++ - - [64, 2048, 1, 512, 96, 96, 544, 544] ++ - [41, 5542.98] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [44, 7771.73] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [49, 9778.7] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [46, 10868.9] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [34, 2055.78] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [33, 3798.32] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [33, 6475.81] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [33, 10103.7] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [7, 14055.7] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [33, 17325.1] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [34, 18171.0] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [5, 63.5655] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [5, 118.698] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [2, 188.677] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [49, 297.025] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [43, 425.537] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [49, 549.424] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [41, 614.55] ++ - - [128, 128, 1, 64, 160, 160, 128, 96] ++ - [9, 138.958] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [9, 263.793] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [9, 450.129] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [49, 686.129] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [41, 945.993] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [48, 1168.41] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [49, 1286.4] ++ - - [128, 256, 1, 64, 160, 160, 128, 96] ++ - [22, 340.501] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [9, 610.791] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [9, 1016.19] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] 
++ - [41, 1489.98] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [49, 2000.02] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [43, 2422.62] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [49, 2606.32] ++ - - [128, 512, 1, 64, 160, 160, 128, 96] ++ - [28, 726.035] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [9, 1289.36] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [44, 2142.41] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [44, 3157.77] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [41, 4194.3] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [39, 5001.78] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [49, 5274.29] ++ - - [128, 1024, 1, 64, 160, 160, 128, 96] ++ - [44, 1468.59] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [42, 2612.86] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [41, 4346.43] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - [47, 6454.02] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [41, 8498.02] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [49, 10038.3] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [40, 10967.3] ++ - - [128, 2048, 1, 64, 160, 160, 128, 96] ++ - [44, 2697.74] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [41, 4830.74] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [44, 8108.85] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [47, 12237.2] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [41, 16675.1] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [41, 20214.3] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [38, 19144.9] ++ - - [128, 4096, 1, 64, 160, 160, 128, 96] ++ - [45, 4424.36] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [32, 8058.22] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [21, 13687.3] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [33, 21218.5] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [9, 29324.4] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 
2080] ++ - [33, 36226.1] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [22, 34507.1] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [14, 141.604] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [5, 263.394] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [1, 421.537] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [41, 643.445] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [41, 904.968] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [44, 1138.48] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [41, 1262.65] ++ - - [256, 128, 1, 64, 288, 288, 256, 96] ++ - [9, 322.589] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [22, 613.023] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [41, 1013.61] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [8, 1491.04] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [48, 1980.55] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [44, 2414.42] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [49, 2587.13] ++ - - [256, 256, 1, 64, 288, 288, 256, 96] ++ - [28, 687.929] ++ - - [256, 256, 1, 128, 288, 288, 256, 160] ++ - [9, 1236.71] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [41, 2041.52] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [44, 3068.25] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [49, 4176.55] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [49, 4988.02] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [44, 5248.0] ++ - - [256, 512, 1, 64, 288, 288, 256, 96] ++ - [22, 1408.43] ++ - - [256, 512, 1, 128, 288, 288, 256, 160] ++ - [41, 2521.37] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [44, 4170.32] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [49, 6216.09] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [44, 8264.64] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [49, 9917.08] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [49, 10609.7] ++ - - [256, 1024, 1, 64, 288, 288, 256, 96] ++ - [49, 2777.22] ++ - - 
[256, 1024, 1, 128, 288, 288, 256, 160] ++ - [49, 4954.88] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [49, 8125.54] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [44, 12225.0] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [41, 16373.0] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [49, 19692.3] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [48, 21657.6] ++ - - [256, 2048, 1, 64, 288, 288, 256, 96] ++ - [33, 4637.79] ++ - - [256, 2048, 1, 128, 288, 288, 256, 160] ++ - [21, 8376.04] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [21, 13583.4] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [21, 20875.3] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [9, 28780.5] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [33, 35400.8] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [21, 38470.2] ++ - - [256, 4096, 1, 64, 288, 288, 256, 96] ++ - [49, 7849.91] ++ - - [256, 4096, 1, 128, 288, 288, 256, 160] ++ - [3, 14079.3] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [23, 20854.2] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [27, 27932.9] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [18, 33924.4] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [28, 36534.2] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [26, 36580.9] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [5, 218.271] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [22, 404.231] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [5, 657.346] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [38, 983.424] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [43, 1377.14] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [44, 1729.85] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [44, 1904.52] ++ - - [384, 128, 1, 64, 416, 416, 384, 96] ++ - [5, 496.172] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [9, 943.954] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [9, 1554.6] ++ - - [384, 128, 1, 
512, 416, 416, 544, 544] ++ - [41, 2252.18] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [44, 3023.83] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [41, 3653.97] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [49, 3913.43] ++ - - [384, 256, 1, 64, 416, 416, 384, 96] ++ - [9, 1050.15] ++ - - [384, 256, 1, 128, 416, 416, 384, 160] ++ - [44, 1875.25] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [47, 3101.14] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [44, 4620.97] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [49, 6264.82] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [41, 7468.99] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [41, 7863.55] ++ - - [384, 512, 1, 64, 416, 416, 384, 96] ++ - [49, 2025.57] ++ - - [384, 512, 1, 128, 416, 416, 384, 160] ++ - [22, 3645.09] ++ - - [384, 512, 1, 256, 416, 416, 384, 288] ++ - [41, 6053.11] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [39, 9035.39] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [41, 12173.6] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [40, 14626.0] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [41, 15811.7] ++ - - [384, 1024, 1, 64, 416, 416, 384, 96] ++ - [21, 3646.69] ++ - - [384, 1024, 1, 128, 416, 416, 384, 160] ++ - [33, 6572.41] ++ - - [384, 1024, 1, 256, 416, 416, 384, 288] ++ - [21, 10867.2] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [33, 16249.1] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [8, 21993.3] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [21, 26658.7] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [33, 29005.9] ++ - - [384, 2048, 1, 64, 416, 416, 384, 96] ++ - [12, 6183.99] ++ - - [384, 2048, 1, 128, 416, 416, 384, 160] ++ - [28, 10738.6] ++ - - [384, 2048, 1, 256, 416, 416, 384, 288] ++ - [2, 18176.8] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [1, 25557.1] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [4, 30740.4] ++ - - [384, 2048, 1, 2048, 416, 
416, 2080, 2080] ++ - [24, 32902.6] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [35, 35810.1] ++ - - [384, 4096, 1, 64, 416, 416, 384, 96] ++ - [28, 10343.5] ++ - - [384, 4096, 1, 128, 416, 416, 384, 160] ++ - [31, 16763.2] ++ - - [384, 4096, 1, 256, 416, 416, 384, 288] ++ - [27, 23679.9] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [31, 30242.8] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [28, 34131.8] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [2, 37668.5] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [28, 37185.0] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [2, 456.432] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [44, 815.695] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [22, 1358.85] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [41, 2109.63] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [49, 2877.73] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [44, 3511.47] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [41, 3828.74] ++ - - [768, 128, 1, 64, 800, 800, 768, 96] ++ - [7, 1033.08] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [7, 1967.92] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [41, 3228.87] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [41, 4624.37] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [47, 6076.87] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [44, 7326.56] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [49, 7814.26] ++ - - [768, 256, 1, 64, 800, 800, 768, 96] ++ - [22, 2032.12] ++ - - [768, 256, 1, 128, 800, 800, 768, 160] ++ - [49, 3641.94] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [49, 6055.28] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [41, 9016.78] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [41, 12240.2] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [41, 14694.3] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [41, 15627.0] ++ - - [768, 512, 1, 64, 800, 800, 768, 96] ++ - 
[21, 3648.28] ++ - - [768, 512, 1, 128, 800, 800, 768, 160] ++ - [8, 6605.18] ++ - - [768, 512, 1, 256, 800, 800, 768, 288] ++ - [6, 10909.6] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [8, 16386.7] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [33, 21931.0] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [8, 26898.2] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [33, 29091.3] ++ - - [768, 1024, 1, 64, 800, 800, 768, 96] ++ - [3, 6200.01] ++ - - [768, 1024, 1, 128, 800, 800, 768, 160] ++ - [5, 10852.0] ++ - - [768, 1024, 1, 256, 800, 800, 768, 288] ++ - [2, 18448.3] ++ - - [768, 1024, 1, 512, 800, 800, 768, 544] ++ - [5, 25720.4] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [31, 30377.5] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [10, 32716.7] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [11, 35693.8] ++ - - [768, 2048, 1, 64, 800, 800, 768, 96] ++ - [14, 10831.0] ++ - - [768, 2048, 1, 128, 800, 800, 768, 160] ++ - [14, 17308.0] ++ - - [768, 2048, 1, 256, 800, 800, 768, 288] ++ - [18, 24651.2] ++ - - [768, 2048, 1, 512, 800, 800, 768, 544] ++ - [2, 31447.4] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [31, 34654.7] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [18, 38424.7] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [28, 40266.6] ++ - - [768, 4096, 1, 64, 800, 800, 768, 96] ++ - [28, 13611.4] ++ - - [768, 4096, 1, 128, 800, 800, 768, 160] ++ - [18, 20825.1] ++ - - [768, 4096, 1, 256, 800, 800, 768, 288] ++ - [5, 28104.5] ++ - - [768, 4096, 1, 512, 800, 800, 768, 544] ++ - [28, 33066.7] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [2, 37666.3] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [18, 40570.6] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [14, 40231.5] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [2, 966.578] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [0, 1768.5] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [20, 
2846.17] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [44, 4206.92] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [49, 5748.9] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [49, 7017.79] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [49, 7271.78] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] ++ - [7, 1920.18] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [9, 3648.81] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [49, 6017.65] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [49, 9205.61] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [44, 12195.0] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [49, 14671.3] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [44, 13850.9] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] ++ - [21, 3677.6] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] ++ - [21, 6618.23] ++ - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] ++ - [33, 11015.9] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [8, 16499.4] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [9, 22029.4] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [8, 26694.9] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [9, 28943.4] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] ++ - [24, 5967.0] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] ++ - [5, 10440.1] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] ++ - [5, 17975.6] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [1, 25368.8] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] ++ - [30, 30814.5] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [10, 32993.5] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [26, 35643.3] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] ++ - [30, 10536.2] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] ++ - [30, 16841.8] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] ++ - [28, 24297.2] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] ++ - 
[18, 31188.0] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [28, 34977.6] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [16, 38439.9] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [17, 39310.3] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] ++ - [31, 13682.7] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] ++ - [28, 20853.1] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] ++ - [14, 28171.3] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] ++ - [14, 33011.1] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] ++ - [31, 37838.4] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [14, 40624.1] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [13, 40569.6] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] ++ - [31, 16107.4] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] ++ - [28, 23625.0] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] ++ - [14, 29528.1] ++ - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] ++ - [13, 35602.3] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] ++ - [28, 39645.1] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [28, 41199.1] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [18, 40470.4] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [19, 1581.17] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [21, 3002.72] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [21, 5094.8] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [9, 7754.66] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [33, 10734.0] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [8, 13216.0] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [21, 14478.2] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] ++ - [8, 4118.79] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [21, 6933.67] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [8, 11427.3] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [21, 17050.0] ++ - - [3072, 128, 1, 1024, 
3104, 3104, 3072, 1056] ++ - [33, 23033.8] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [8, 27546.0] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [8, 29441.2] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] ++ - [22, 7468.71] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] ++ - [31, 12682.8] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [5, 19388.2] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [5, 25789.6] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [30, 30932.9] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [37, 32913.3] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [10, 36545.9] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] ++ - [30, 10490.1] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] ++ - [14, 16913.9] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] ++ - [28, 24281.1] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [18, 31168.7] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [14, 34598.1] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [16, 38508.8] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [2, 37806.2] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] ++ - [28, 13658.5] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] ++ - [31, 20927.9] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] ++ - [31, 28140.8] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] ++ - [14, 32921.4] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [31, 37670.7] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - [13, 40662.0] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [28, 39736.3] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] ++ - [31, 16083.6] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] ++ - [18, 23636.1] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] ++ - [2, 29682.1] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] ++ - [17, 35629.9] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] ++ - [14, 39631.9] ++ 
- - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [14, 41348.8] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [31, 40294.3] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] ++ - [31, 16959.2] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] ++ - [31, 24553.1] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] ++ - [5, 31740.6] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] ++ - [17, 37183.5] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] ++ - [31, 39984.0] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] ++ - [14, 41333.5] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [23, 38372.4] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [21, 2211.02] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [32, 4254.94] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [19, 7136.96] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [21, 10914.7] ++ - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] ++ - [21, 14685.5] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [8, 17868.9] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [9, 18919.9] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] ++ - [19, 5265.92] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [21, 9489.38] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [33, 15606.7] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [33, 23017.9] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [22, 30352.3] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [8, 36329.1] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [21, 37923.3] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] ++ - [33, 8889.77] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] ++ - [18, 14731.4] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [28, 21226.9] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [14, 28508.4] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [28, 34183.6] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - 
[29, 36300.2] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [11, 37577.9] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] ++ - [36, 11821.2] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] ++ - [31, 18642.6] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] ++ - [31, 25960.9] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [18, 32563.3] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [31, 35803.3] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [31, 39464.9] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [25, 39013.4] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] ++ - [30, 14741.9] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] ++ - [28, 22210.4] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] ++ - [14, 29142.9] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] ++ - [18, 34197.8] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [18, 38624.5] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [18, 41330.2] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [30, 40005.7] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] ++ - [14, 16793.0] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] ++ - [14, 23383.9] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] ++ - [2, 30709.5] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] ++ - [13, 36397.4] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] ++ - [28, 40230.9] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [13, 41088.7] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [28, 40108.3] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] ++ - [34, 17263.3] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] ++ - [14, 24821.2] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] ++ - [5, 31872.9] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] ++ - [17, 37287.4] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] ++ - [13, 39593.2] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] ++ - [15, 41369.1] ++ - - [4096, 4096, 
1, 4096, 4128, 4128, 4128, 4128] ++ - [17, 36883.9] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HB.yaml +new file mode 100644 +index 00000000..85c26f72 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HB.yaml +@@ -0,0 +1,22713 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: 
{0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: 
false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ 
IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ 
ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ 
StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: 
{0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: 
false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ 
IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ 
ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ 
StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 
++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ 
Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ 
_VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 
++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 
++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ 
MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ 
TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} 
++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 
++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ 
_VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ 
MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ 
TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} 
++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 
++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ 
_VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 
++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ 
StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ 
AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ 
LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] 
++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: 
null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ 
StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ 
AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ 
LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] 
++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: 
null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 
++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ 
StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 79 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ 
AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ 
LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] 
++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 80 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [2, 36.787] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [29, 61.6809] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [71, 105.778] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [71, 165.352] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [71, 228.523] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [77, 285.911] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [71, 316.438] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [35, 64.9677] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [29, 119.048] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [29, 204.52] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [77, 324.085] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [71, 455.878] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [71, 574.032] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [80, 633.976] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [29, 154.361] ++ - - [64, 256, 1, 128, 96, 96, 160, 160] ++ - [29, 280.48] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [77, 483.549] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [71, 727.546] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [71, 992.327] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [61, 1191.18] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [77, 1301.8] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [71, 331.568] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [77, 597.056] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [74, 1002.7] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [71, 1479.99] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [77, 2015.04] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [77, 2450.3] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [74, 2626.16] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [48, 682.001] ++ - - [64, 1024, 
1, 128, 96, 96, 160, 160] ++ - [77, 1233.08] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [80, 2064.63] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [79, 3019.93] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [64, 4102.27] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [80, 5002.53] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [62, 5363.78] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [70, 1281.5] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [66, 2328.88] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [76, 3924.04] ++ - - [64, 2048, 1, 512, 96, 96, 544, 544] ++ - [60, 6173.2] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [60, 8336.5] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [79, 10165.3] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [67, 11151.6] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [11, 2210.15] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [72, 3995.05] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [58, 6519.22] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [13, 9791.93] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [15, 13509.6] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [12, 16861.5] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [68, 14846.1] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [6, 73.7603] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [2, 128.691] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [8, 204.421] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [77, 322.341] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [80, 451.364] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [74, 575.667] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [71, 630.285] ++ - - [128, 128, 1, 64, 160, 160, 128, 96] ++ - [34, 167.772] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [19, 306.154] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [49, 506.314] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [71, 781.355] ++ - - [128, 128, 1, 1024, 
160, 160, 1056, 1056] ++ - [71, 1036.01] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [80, 1222.96] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [71, 1305.11] ++ - - [128, 256, 1, 64, 160, 160, 128, 96] ++ - [44, 386.287] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [10, 679.13] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [49, 1111.22] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [63, 1634.73] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [67, 2098.2] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [67, 2503.6] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [77, 2658.73] ++ - - [128, 512, 1, 64, 160, 160, 128, 96] ++ - [67, 798.767] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [10, 1399.97] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [77, 2307.73] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [80, 3356.46] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [63, 4361.97] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [74, 5071.52] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [62, 5336.48] ++ - - [128, 1024, 1, 64, 160, 160, 128, 96] ++ - [71, 1512.29] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [77, 2705.57] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [77, 4459.08] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - [69, 6578.67] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [65, 8665.37] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [65, 10514.5] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [80, 11038.3] ++ - - [128, 2048, 1, 64, 160, 160, 128, 96] ++ - [63, 2750.36] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [74, 4932.32] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [77, 8253.46] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [73, 12729.3] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [65, 17271.6] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [67, 20606.1] ++ - - [128, 2048, 1, 4096, 160, 
160, 4128, 4128] ++ - [64, 18206.4] ++ - - [128, 4096, 1, 64, 160, 160, 128, 96] ++ - [78, 4783.92] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [68, 8345.84] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [80, 13684.5] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [53, 20285.4] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [24, 27543.2] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [32, 34444.6] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [68, 29748.9] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [44, 163.457] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [14, 283.9] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [77, 455.46] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [77, 721.789] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [76, 977.866] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [71, 1197.13] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [77, 1282.91] ++ - - [256, 128, 1, 64, 288, 288, 256, 96] ++ - [45, 377.253] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [10, 682.006] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [77, 1113.73] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [71, 1630.44] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [63, 2126.53] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [67, 2480.46] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [74, 2627.34] ++ - - [256, 256, 1, 64, 288, 288, 256, 96] ++ - [49, 802.891] ++ - - [256, 256, 1, 128, 288, 288, 256, 160] ++ - [29, 1400.9] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [71, 2296.68] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [71, 3342.76] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [77, 4234.53] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [77, 5028.58] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [71, 5335.53] ++ - - [256, 512, 1, 64, 288, 288, 256, 96] ++ - [71, 1605.78] ++ - - [256, 512, 1, 128, 288, 288, 256, 160] ++ - [63, 
2827.3] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [63, 4428.47] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [77, 6535.73] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [77, 8538.59] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [74, 10038.7] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [63, 10651.8] ++ - - [256, 1024, 1, 64, 288, 288, 256, 96] ++ - [77, 2963.65] ++ - - [256, 1024, 1, 128, 288, 288, 256, 160] ++ - [80, 5261.79] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [71, 8682.74] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [61, 12774.2] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [77, 16608.0] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [65, 20356.1] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [66, 21677.3] ++ - - [256, 2048, 1, 64, 288, 288, 256, 96] ++ - [30, 4790.75] ++ - - [256, 2048, 1, 128, 288, 288, 256, 160] ++ - [45, 8530.43] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [10, 14281.5] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [28, 21160.0] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [22, 28420.9] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [8, 35298.5] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [77, 31999.0] ++ - - [256, 4096, 1, 64, 288, 288, 256, 96] ++ - [71, 9346.64] ++ - - [256, 4096, 1, 128, 288, 288, 256, 160] ++ - [24, 15319.9] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [45, 24103.1] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [45, 30483.3] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [24, 36005.1] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [44, 38255.1] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [41, 37032.3] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [44, 250.457] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [29, 416.267] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [71, 701.702] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [76, 
1074.55] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [70, 1463.55] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [70, 1812.19] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [71, 1934.61] ++ - - [384, 128, 1, 64, 416, 416, 384, 96] ++ - [39, 548.516] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [77, 1038.19] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [77, 1701.77] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [71, 2468.45] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [63, 3202.58] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [62, 3768.05] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [70, 3949.37] ++ - - [384, 256, 1, 64, 416, 416, 384, 96] ++ - [63, 1146.19] ++ - - [384, 256, 1, 128, 416, 416, 384, 160] ++ - [29, 2032.45] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [63, 3489.93] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [71, 5045.79] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [77, 6521.33] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [77, 7541.18] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [76, 8141.82] ++ - - [384, 512, 1, 64, 416, 416, 384, 96] ++ - [77, 2310.49] ++ - - [384, 512, 1, 128, 416, 416, 384, 160] ++ - [74, 4098.02] ++ - - [384, 512, 1, 256, 416, 416, 384, 288] ++ - [61, 6701.08] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [29, 9589.72] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [63, 12523.4] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [63, 15113.5] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [74, 15963.1] ++ - - [384, 1024, 1, 64, 416, 416, 384, 96] ++ - [17, 3942.63] ++ - - [384, 1024, 1, 128, 416, 416, 384, 160] ++ - [6, 6739.64] ++ - - [384, 1024, 1, 256, 416, 416, 384, 288] ++ - [2, 11123.0] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [5, 16711.8] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [36, 21789.8] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [6, 25744.3] ++ - - 
[384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [55, 28410.4] ++ - - [384, 2048, 1, 64, 416, 416, 384, 96] ++ - [59, 7146.34] ++ - - [384, 2048, 1, 128, 416, 416, 384, 160] ++ - [33, 12191.3] ++ - - [384, 2048, 1, 256, 416, 416, 384, 288] ++ - [45, 18925.3] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [59, 26637.6] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [45, 31853.0] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [51, 33147.1] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [50, 36142.8] ++ - - [384, 4096, 1, 64, 416, 416, 384, 96] ++ - [15, 12686.0] ++ - - [384, 4096, 1, 128, 416, 416, 384, 160] ++ - [32, 20040.5] ++ - - [384, 4096, 1, 256, 416, 416, 384, 288] ++ - [6, 27488.7] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [44, 33494.5] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [0, 36072.0] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [45, 38920.6] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [53, 37168.5] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [4, 519.187] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [1, 905.115] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [62, 1460.59] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [71, 2215.5] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [71, 2979.09] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [71, 3611.11] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [77, 3905.31] ++ - - [768, 128, 1, 64, 800, 800, 768, 96] ++ - [57, 1119.08] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [10, 2030.48] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [77, 3460.17] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [77, 5031.16] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [63, 6476.03] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [77, 7572.38] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [64, 7588.36] ++ - - [768, 256, 1, 64, 800, 800, 768, 96] ++ - [77, 2205.98] ++ - - [768, 256, 1, 
128, 800, 800, 768, 160] ++ - [71, 3886.63] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [77, 6449.47] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [71, 9565.14] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [77, 12743.8] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [77, 15041.2] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [67, 15194.5] ++ - - [768, 512, 1, 64, 800, 800, 768, 96] ++ - [14, 3786.61] ++ - - [768, 512, 1, 128, 800, 800, 768, 160] ++ - [71, 6866.55] ++ - - [768, 512, 1, 256, 800, 800, 768, 288] ++ - [0, 11513.6] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [4, 16846.0] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [3, 21862.0] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [0, 25894.1] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [36, 27419.8] ++ - - [768, 1024, 1, 64, 800, 800, 768, 96] ++ - [34, 6767.74] ++ - - [768, 1024, 1, 128, 800, 800, 768, 160] ++ - [5, 11532.1] ++ - - [768, 1024, 1, 256, 800, 800, 768, 288] ++ - [25, 18934.1] ++ - - [768, 1024, 1, 512, 800, 800, 768, 544] ++ - [25, 25710.6] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [45, 30780.4] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [42, 33085.8] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [41, 35682.8] ++ - - [768, 2048, 1, 64, 800, 800, 768, 96] ++ - [22, 13259.1] ++ - - [768, 2048, 1, 128, 800, 800, 768, 160] ++ - [23, 20695.6] ++ - - [768, 2048, 1, 256, 800, 800, 768, 288] ++ - [44, 27503.7] ++ - - [768, 2048, 1, 512, 800, 800, 768, 544] ++ - [25, 34032.3] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [25, 36592.4] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [2, 39840.6] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [25, 40606.7] ++ - - [768, 4096, 1, 64, 800, 800, 768, 96] ++ - [44, 18355.8] ++ - - [768, 4096, 1, 128, 800, 800, 768, 160] ++ - [44, 26355.1] ++ - - [768, 4096, 1, 256, 800, 800, 768, 288] ++ - [6, 33211.3] ++ - - [768, 4096, 1, 512, 800, 
800, 768, 544] ++ - [2, 36022.8] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [6, 39866.7] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [23, 42018.9] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [22, 40810.0] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [5, 1044.92] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [9, 1758.37] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [48, 2922.18] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [70, 4416.61] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [76, 5931.5] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [63, 7239.89] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [64, 7182.67] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] ++ - [10, 2125.85] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [39, 3885.41] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [49, 6359.84] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [63, 9418.37] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [77, 12395.5] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [63, 15041.8] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [60, 15514.4] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] ++ - [53, 3786.64] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] ++ - [45, 6720.74] ++ - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] ++ - [0, 10987.0] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [6, 16204.7] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [2, 21645.7] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [6, 26488.6] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [18, 24964.2] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] ++ - [34, 6832.04] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] ++ - [59, 11611.9] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] ++ - [13, 19552.0] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [16, 26478.2] ++ - - [1536, 512, 1, 1024, 1568, 1568, 
1536, 1056] ++ - [6, 31531.2] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [6, 33267.5] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [51, 36010.3] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] ++ - [52, 13257.4] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] ++ - [54, 20778.9] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] ++ - [45, 28500.4] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] ++ - [44, 34005.0] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [25, 36553.3] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [23, 40005.3] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [25, 40401.7] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] ++ - [45, 17843.4] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] ++ - [25, 25753.4] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] ++ - [23, 32806.7] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] ++ - [23, 36010.7] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] ++ - [23, 39857.8] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [25, 41968.8] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [6, 41102.7] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] ++ - [50, 22328.7] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] ++ - [54, 30248.6] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] ++ - [23, 34817.3] ++ - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] ++ - [44, 39279.0] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] ++ - [25, 41810.0] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [44, 42416.8] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [25, 41505.7] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [40, 2025.58] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [7, 3677.08] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [46, 5695.56] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [46, 8489.8] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [7, 11028.6] ++ - - 
[3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [7, 13281.9] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [37, 13892.0] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] ++ - [25, 4517.31] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [75, 7468.71] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [27, 12489.2] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [47, 18088.7] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [8, 23391.1] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [27, 27574.3] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [56, 24550.9] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] ++ - [23, 8080.21] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] ++ - [59, 14051.3] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [14, 20655.2] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [23, 27078.2] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [44, 31407.0] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [51, 33095.3] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [20, 36830.0] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] ++ - [34, 12782.6] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] ++ - [45, 20817.6] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] ++ - [45, 28371.8] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [25, 34249.4] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [44, 36287.4] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [44, 39622.0] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [0, 38846.3] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] ++ - [51, 18282.5] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] ++ - [44, 26310.4] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] ++ - [25, 33269.0] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] ++ - [45, 35983.3] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [23, 39745.2] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - 
[25, 41865.1] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [59, 40445.5] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] ++ - [51, 22383.3] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] ++ - [25, 30054.4] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] ++ - [23, 34728.7] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] ++ - [45, 39256.0] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] ++ - [44, 41769.3] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [45, 42628.2] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [24, 41025.3] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] ++ - [50, 25967.6] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] ++ - [44, 31698.8] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] ++ - [25, 37702.1] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] ++ - [23, 40959.6] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] ++ - [54, 42079.5] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] ++ - [44, 42717.8] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [21, 40209.8] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [9, 2530.88] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [38, 4329.6] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [48, 7176.65] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [26, 10639.5] ++ - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] ++ - [57, 14002.9] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [7, 17752.5] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [57, 17227.3] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] ++ - [44, 6028.46] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [6, 10596.7] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [17, 17060.9] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [6, 24385.5] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [27, 30782.2] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [7, 35920.8] ++ - - [4096, 128, 1, 4096, 4128, 
4128, 4128, 4128] ++ - [48, 33528.8] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] ++ - [45, 9961.24] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] ++ - [30, 16153.3] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [44, 23540.8] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [53, 30528.4] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [45, 36224.9] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [25, 37845.9] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [20, 38296.0] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] ++ - [34, 14835.6] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] ++ - [44, 22448.3] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] ++ - [23, 29799.7] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [53, 35931.6] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [23, 37623.1] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [25, 40661.7] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [59, 39759.2] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] ++ - [50, 20171.0] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] ++ - [45, 28238.6] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] ++ - [44, 34688.3] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] ++ - [25, 37404.8] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [23, 40765.5] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [45, 42570.8] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [43, 41104.1] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] ++ - [31, 23998.5] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] ++ - [34, 31414.4] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] ++ - [25, 36192.6] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] ++ - [45, 39979.6] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] ++ - [44, 42379.5] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [25, 42863.8] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [21, 41070.2] 
++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] ++ - [45, 23824.4] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] ++ - [53, 30552.7] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] ++ - [14, 35533.5] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] ++ - [25, 40993.9] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] ++ - [25, 42468.8] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] ++ - [25, 42915.8] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [30, 37918.4] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HB_GB.yaml +new file mode 100644 +index 00000000..a70aa67a +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HB_GB.yaml +@@ -0,0 +1,22713 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ 
UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 
++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ 
Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} 
++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ 
Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} 
++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ 
MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ 
TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} 
++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ 
Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} 
++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false 
++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 
1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ 
StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 
2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ 
Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} 
++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ 
Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} 
++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false 
++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 
1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ 
MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ 
TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} 
++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ 
Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} 
++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ 
Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} 
++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false 
++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 
1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 
++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ 
StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ 
AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ 
LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] 
++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: 
null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ 
StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ 
AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ 
LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] 
++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ 
ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: 
Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ 
DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly 
++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ 
SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' 
++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: 
[] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly 
++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ 
SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: 
[] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 79 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 80 ++ SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [2, 36.787] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [29, 61.6809] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [71, 105.778] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [71, 165.352] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [71, 228.523] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [77, 285.911] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [71, 316.438] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [35, 64.9677] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [29, 119.048] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [29, 204.52] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [77, 324.085] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [71, 455.878] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [71, 574.032] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [80, 633.976] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [29, 154.361] ++ - - [64, 256, 1, 128, 96, 96, 160, 160] ++ - [29, 280.48] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [77, 483.549] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [71, 727.546] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [71, 992.327] ++ - - [64, 
256, 1, 2048, 96, 96, 2080, 2080] ++ - [61, 1191.18] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [77, 1301.8] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [71, 331.568] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [77, 597.056] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [74, 1002.7] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [71, 1479.99] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [77, 2015.04] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [77, 2450.3] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [74, 2626.16] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [48, 682.001] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [77, 1233.08] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [80, 2064.63] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [79, 3019.93] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [64, 4102.27] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [80, 5002.53] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [62, 5363.78] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [70, 1281.5] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [66, 2328.88] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [76, 3924.04] ++ - - [64, 2048, 1, 512, 96, 96, 544, 544] ++ - [60, 6173.2] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [60, 8336.5] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [79, 10165.3] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [67, 11151.6] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [11, 2210.15] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [72, 3995.05] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [58, 6519.22] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [13, 9791.93] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [15, 13509.6] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [12, 16861.5] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [68, 14846.1] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [6, 73.7603] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] 
++ - [2, 128.691] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [8, 204.421] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [77, 322.341] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [80, 451.364] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [74, 575.667] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [71, 630.285] ++ - - [128, 128, 1, 64, 160, 160, 128, 96] ++ - [34, 167.772] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [19, 306.154] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [49, 506.314] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [71, 781.355] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [71, 1036.01] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [80, 1222.96] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [71, 1305.11] ++ - - [128, 256, 1, 64, 160, 160, 128, 96] ++ - [44, 386.287] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [10, 679.13] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [49, 1111.22] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [63, 1634.73] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [67, 2098.2] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [67, 2503.6] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [77, 2658.73] ++ - - [128, 512, 1, 64, 160, 160, 128, 96] ++ - [67, 798.767] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [10, 1399.97] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [77, 2307.73] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [80, 3356.46] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [63, 4361.97] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [74, 5071.52] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [62, 5336.48] ++ - - [128, 1024, 1, 64, 160, 160, 128, 96] ++ - [71, 1512.29] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [77, 2705.57] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [77, 4459.08] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - [69, 6578.67] ++ - - 
[128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [65, 8665.37] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [65, 10514.5] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [80, 11038.3] ++ - - [128, 2048, 1, 64, 160, 160, 128, 96] ++ - [63, 2750.36] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [74, 4932.32] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [77, 8253.46] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [73, 12729.3] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [65, 17271.6] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [67, 20606.1] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [64, 18206.4] ++ - - [128, 4096, 1, 64, 160, 160, 128, 96] ++ - [78, 4783.92] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [68, 8345.84] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [80, 13684.5] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [53, 20285.4] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [24, 27543.2] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [32, 34444.6] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [68, 29748.9] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [44, 163.457] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [14, 283.9] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [77, 455.46] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [77, 721.789] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [76, 977.866] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [71, 1197.13] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [77, 1282.91] ++ - - [256, 128, 1, 64, 288, 288, 256, 96] ++ - [45, 377.253] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [10, 682.006] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [77, 1113.73] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [71, 1630.44] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [63, 2126.53] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [67, 2480.46] ++ - - [256, 
128, 1, 4096, 288, 288, 4128, 4128] ++ - [74, 2627.34] ++ - - [256, 256, 1, 64, 288, 288, 256, 96] ++ - [49, 802.891] ++ - - [256, 256, 1, 128, 288, 288, 256, 160] ++ - [29, 1400.9] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [71, 2296.68] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [71, 3342.76] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [77, 4234.53] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [77, 5028.58] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [71, 5335.53] ++ - - [256, 512, 1, 64, 288, 288, 256, 96] ++ - [71, 1605.78] ++ - - [256, 512, 1, 128, 288, 288, 256, 160] ++ - [63, 2827.3] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [63, 4428.47] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [77, 6535.73] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [77, 8538.59] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [74, 10038.7] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [63, 10651.8] ++ - - [256, 1024, 1, 64, 288, 288, 256, 96] ++ - [77, 2963.65] ++ - - [256, 1024, 1, 128, 288, 288, 256, 160] ++ - [80, 5261.79] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [71, 8682.74] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [61, 12774.2] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [77, 16608.0] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [65, 20356.1] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [66, 21677.3] ++ - - [256, 2048, 1, 64, 288, 288, 256, 96] ++ - [30, 4790.75] ++ - - [256, 2048, 1, 128, 288, 288, 256, 160] ++ - [45, 8530.43] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [10, 14281.5] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [28, 21160.0] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [22, 28420.9] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [8, 35298.5] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [77, 31999.0] ++ - - [256, 4096, 1, 64, 288, 288, 256, 96] ++ - [71, 9346.64] ++ - - [256, 4096, 1, 
128, 288, 288, 256, 160] ++ - [24, 15319.9] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [45, 24103.1] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [45, 30483.3] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [24, 36005.1] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [44, 38255.1] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [41, 37032.3] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [44, 250.457] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [29, 416.267] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [71, 701.702] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [76, 1074.55] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [70, 1463.55] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [70, 1812.19] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [71, 1934.61] ++ - - [384, 128, 1, 64, 416, 416, 384, 96] ++ - [39, 548.516] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [77, 1038.19] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [77, 1701.77] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [71, 2468.45] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [63, 3202.58] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [62, 3768.05] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [70, 3949.37] ++ - - [384, 256, 1, 64, 416, 416, 384, 96] ++ - [63, 1146.19] ++ - - [384, 256, 1, 128, 416, 416, 384, 160] ++ - [29, 2032.45] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [63, 3489.93] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [71, 5045.79] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [77, 6521.33] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [77, 7541.18] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [76, 8141.82] ++ - - [384, 512, 1, 64, 416, 416, 384, 96] ++ - [77, 2310.49] ++ - - [384, 512, 1, 128, 416, 416, 384, 160] ++ - [74, 4098.02] ++ - - [384, 512, 1, 256, 416, 416, 384, 288] ++ - [61, 6701.08] ++ - - [384, 512, 1, 512, 416, 416, 544, 
544] ++ - [29, 9589.72] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [63, 12523.4] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [63, 15113.5] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [74, 15963.1] ++ - - [384, 1024, 1, 64, 416, 416, 384, 96] ++ - [17, 3942.63] ++ - - [384, 1024, 1, 128, 416, 416, 384, 160] ++ - [6, 6739.64] ++ - - [384, 1024, 1, 256, 416, 416, 384, 288] ++ - [2, 11123.0] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [5, 16711.8] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [36, 21789.8] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [6, 25744.3] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [55, 28410.4] ++ - - [384, 2048, 1, 64, 416, 416, 384, 96] ++ - [59, 7146.34] ++ - - [384, 2048, 1, 128, 416, 416, 384, 160] ++ - [33, 12191.3] ++ - - [384, 2048, 1, 256, 416, 416, 384, 288] ++ - [45, 18925.3] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [59, 26637.6] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [45, 31853.0] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [51, 33147.1] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [50, 36142.8] ++ - - [384, 4096, 1, 64, 416, 416, 384, 96] ++ - [15, 12686.0] ++ - - [384, 4096, 1, 128, 416, 416, 384, 160] ++ - [32, 20040.5] ++ - - [384, 4096, 1, 256, 416, 416, 384, 288] ++ - [6, 27488.7] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [44, 33494.5] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [0, 36072.0] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [45, 38920.6] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [53, 37168.5] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [4, 519.187] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [1, 905.115] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [62, 1460.59] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [71, 2215.5] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [71, 2979.09] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - 
[71, 3611.11] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [77, 3905.31] ++ - - [768, 128, 1, 64, 800, 800, 768, 96] ++ - [57, 1119.08] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [10, 2030.48] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [77, 3460.17] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [77, 5031.16] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [63, 6476.03] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [77, 7572.38] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [64, 7588.36] ++ - - [768, 256, 1, 64, 800, 800, 768, 96] ++ - [77, 2205.98] ++ - - [768, 256, 1, 128, 800, 800, 768, 160] ++ - [71, 3886.63] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [77, 6449.47] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [71, 9565.14] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [77, 12743.8] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [77, 15041.2] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [67, 15194.5] ++ - - [768, 512, 1, 64, 800, 800, 768, 96] ++ - [14, 3786.61] ++ - - [768, 512, 1, 128, 800, 800, 768, 160] ++ - [71, 6866.55] ++ - - [768, 512, 1, 256, 800, 800, 768, 288] ++ - [0, 11513.6] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [4, 16846.0] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [3, 21862.0] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [0, 25894.1] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [36, 27419.8] ++ - - [768, 1024, 1, 64, 800, 800, 768, 96] ++ - [34, 6767.74] ++ - - [768, 1024, 1, 128, 800, 800, 768, 160] ++ - [5, 11532.1] ++ - - [768, 1024, 1, 256, 800, 800, 768, 288] ++ - [25, 18934.1] ++ - - [768, 1024, 1, 512, 800, 800, 768, 544] ++ - [25, 25710.6] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [45, 30780.4] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [42, 33085.8] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [41, 35682.8] ++ - - [768, 2048, 1, 64, 800, 800, 768, 96] ++ - [22, 13259.1] ++ - 
- [768, 2048, 1, 128, 800, 800, 768, 160] ++ - [23, 20695.6] ++ - - [768, 2048, 1, 256, 800, 800, 768, 288] ++ - [44, 27503.7] ++ - - [768, 2048, 1, 512, 800, 800, 768, 544] ++ - [25, 34032.3] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [25, 36592.4] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [2, 39840.6] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [25, 40606.7] ++ - - [768, 4096, 1, 64, 800, 800, 768, 96] ++ - [44, 18355.8] ++ - - [768, 4096, 1, 128, 800, 800, 768, 160] ++ - [44, 26355.1] ++ - - [768, 4096, 1, 256, 800, 800, 768, 288] ++ - [6, 33211.3] ++ - - [768, 4096, 1, 512, 800, 800, 768, 544] ++ - [2, 36022.8] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [6, 39866.7] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [23, 42018.9] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [22, 40810.0] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [5, 1044.92] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [9, 1758.37] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [48, 2922.18] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [70, 4416.61] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [76, 5931.5] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [63, 7239.89] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [64, 7182.67] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] ++ - [10, 2125.85] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [39, 3885.41] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [49, 6359.84] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [63, 9418.37] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [77, 12395.5] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [63, 15041.8] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [60, 15514.4] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] ++ - [53, 3786.64] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] ++ - [45, 6720.74] ++ - - [1536, 256, 1, 256, 1568, 
1568, 1536, 288] ++ - [0, 10987.0] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [6, 16204.7] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [2, 21645.7] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [6, 26488.6] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [18, 24964.2] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] ++ - [34, 6832.04] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] ++ - [59, 11611.9] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] ++ - [13, 19552.0] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [16, 26478.2] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] ++ - [6, 31531.2] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [6, 33267.5] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [51, 36010.3] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] ++ - [52, 13257.4] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] ++ - [54, 20778.9] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] ++ - [45, 28500.4] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] ++ - [44, 34005.0] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [25, 36553.3] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [23, 40005.3] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [25, 40401.7] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] ++ - [45, 17843.4] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] ++ - [25, 25753.4] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] ++ - [23, 32806.7] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] ++ - [23, 36010.7] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] ++ - [23, 39857.8] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [25, 41968.8] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [6, 41102.7] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] ++ - [50, 22328.7] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] ++ - [54, 30248.6] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] ++ - [23, 34817.3] ++ - 
- [1536, 4096, 1, 512, 1568, 1568, 1536, 544] ++ - [44, 39279.0] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] ++ - [25, 41810.0] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [44, 42416.8] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [25, 41505.7] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [40, 2025.58] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [7, 3677.08] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [46, 5695.56] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [46, 8489.8] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [7, 11028.6] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [7, 13281.9] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [37, 13892.0] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] ++ - [25, 4517.31] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [75, 7468.71] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [27, 12489.2] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [47, 18088.7] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [8, 23391.1] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [27, 27574.3] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [56, 24550.9] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] ++ - [23, 8080.21] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] ++ - [59, 14051.3] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [14, 20655.2] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [23, 27078.2] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [44, 31407.0] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [51, 33095.3] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [20, 36830.0] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] ++ - [34, 12782.6] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] ++ - [45, 20817.6] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] ++ - [45, 28371.8] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [25, 
34249.4] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [44, 36287.4] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [44, 39622.0] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [0, 38846.3] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] ++ - [51, 18282.5] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] ++ - [44, 26310.4] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] ++ - [25, 33269.0] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] ++ - [45, 35983.3] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [23, 39745.2] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - [25, 41865.1] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [59, 40445.5] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] ++ - [51, 22383.3] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] ++ - [25, 30054.4] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] ++ - [23, 34728.7] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] ++ - [45, 39256.0] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] ++ - [44, 41769.3] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [45, 42628.2] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [24, 41025.3] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] ++ - [50, 25967.6] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] ++ - [44, 31698.8] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] ++ - [25, 37702.1] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] ++ - [23, 40959.6] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] ++ - [54, 42079.5] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] ++ - [44, 42717.8] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [21, 40209.8] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [9, 2530.88] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [38, 4329.6] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [48, 7176.65] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [26, 10639.5] ++ - - [4096, 64, 1, 1024, 
4128, 4128, 4096, 1056] ++ - [57, 14002.9] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [7, 17752.5] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [57, 17227.3] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] ++ - [44, 6028.46] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [6, 10596.7] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [17, 17060.9] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [6, 24385.5] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [27, 30782.2] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [7, 35920.8] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [48, 33528.8] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] ++ - [45, 9961.24] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] ++ - [30, 16153.3] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [44, 23540.8] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [53, 30528.4] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [45, 36224.9] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [25, 37845.9] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [20, 38296.0] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] ++ - [34, 14835.6] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] ++ - [44, 22448.3] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] ++ - [23, 29799.7] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [53, 35931.6] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [23, 37623.1] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [25, 40661.7] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [59, 39759.2] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] ++ - [50, 20171.0] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] ++ - [45, 28238.6] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] ++ - [44, 34688.3] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] ++ - [25, 37404.8] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [23, 40765.5] ++ - - 
[4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [45, 42570.8] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [43, 41104.1] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] ++ - [31, 23998.5] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] ++ - [34, 31414.4] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] ++ - [25, 36192.6] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] ++ - [45, 39979.6] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] ++ - [44, 42379.5] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [25, 42863.8] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [21, 41070.2] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] ++ - [45, 23824.4] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] ++ - [53, 30552.7] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] ++ - [14, 35533.5] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] ++ - [25, 40993.9] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] ++ - [25, 42468.8] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] ++ - [25, 42915.8] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [30, 37918.4] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HHS_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HHS_BH.yaml +new file mode 100644 +index 00000000..37b49c2b +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HHS_BH.yaml +@@ -0,0 +1,27843 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ 
IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: 
Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 
++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 
1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 
++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 
++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false 
++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ 
LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ 
ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ 
VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: 
true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ 
PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ 
GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false 
++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 
++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 
++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ 
VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ 
GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ 
FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 
1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ 
MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: 
false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ 
LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ 
Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ 
UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ 
GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ 
PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ 
StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 
1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ 
MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 
1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ 
LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 
1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 
++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ 
EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ 
MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: 
false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ 
MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ 
IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: 
false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: 
false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 
++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ 
UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ 
GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ 
NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ 
StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 
0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ 
MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: 
false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ 
AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 
1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 
++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 
++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} 
++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 
++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 
++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true 
++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ 
LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ 
IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B 
++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 
1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 
++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ 
ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: 
Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 
++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 
++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 
++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 
1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 
++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 
++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false 
++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ 
LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ 
ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ 
VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: 
true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ 
PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 79 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 
++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 80 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ 
LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 81 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ 
VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ 
GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 82 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ 
FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 83 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 
16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ 
MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 84 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ 
AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ 
LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ 
ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 85 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ 
UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 
++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ 
PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 86 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false 
++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false 
++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 87 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 
16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ 
MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 88 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: 
false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ 
LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ 
Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 89 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ 
UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ 
GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ 
PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 90 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ 
StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 
1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ 
MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 91 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 92 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false 
++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ 
LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 93 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 
1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 94 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 
++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ 
EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ 
MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: 
false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 95 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ 
MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ 
IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 96 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 97 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: 
false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 98 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ 
StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ 
DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ 
MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: 
false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 99 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [2, 35.3438] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [10, 59.4905] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [80, 99.0533] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [83, 158.228] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [67, 223.232] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [80, 280.387] ++ - - [64, 64, 1, 4096, 96, 
96, 4128, 4128] ++ - [75, 312.98] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [23, 65.4054] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [23, 114.737] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [83, 198.051] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [67, 314.417] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [80, 444.948] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [68, 561.694] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [79, 624.758] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [19, 148.776] ++ - - [64, 256, 1, 128, 96, 96, 160, 160] ++ - [87, 266.036] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [83, 453.832] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [80, 677.703] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [87, 942.385] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [79, 1182.04] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [79, 1286.9] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [34, 298.019] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [68, 543.307] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [83, 954.23] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [5, 1422.41] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [79, 1943.39] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [90, 2382.8] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [84, 2578.09] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [83, 640.841] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [78, 1160.25] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [66, 1944.06] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [86, 2955.31] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [76, 3905.1] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [75, 4816.38] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [71, 5189.58] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [83, 1203.02] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [77, 2285.42] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [83, 3851.97] ++ - - [64, 2048, 
1, 512, 96, 96, 544, 544] ++ - [79, 5670.86] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [85, 7965.46] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [79, 9678.61] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [89, 10660.7] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [6, 2186.53] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [6, 3834.38] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [6, 6552.34] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [16, 10422.3] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [33, 14322.0] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [27, 17879.7] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [14, 18480.7] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [1, 73.5639] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [11, 118.873] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [87, 204.143] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [9, 317.438] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [68, 445.517] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [73, 559.988] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [75, 622.903] ++ - - [128, 128, 1, 64, 160, 160, 128, 96] ++ - [42, 170.751] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [34, 296.46] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [27, 511.19] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [68, 761.289] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [27, 999.126] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [79, 1217.24] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [75, 1299.3] ++ - - [128, 256, 1, 64, 160, 160, 128, 96] ++ - [10, 375.363] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [27, 662.924] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [66, 1085.77] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [7, 1584.11] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [79, 2060.71] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [76, 2473.52] ++ - - 
[128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [67, 2668.41] ++ - - [128, 512, 1, 64, 160, 160, 128, 96] ++ - [8, 778.312] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [68, 1381.52] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [68, 2269.96] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [7, 3219.89] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [74, 4286.2] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [80, 4974.72] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [85, 5294.81] ++ - - [128, 1024, 1, 64, 160, 160, 128, 96] ++ - [22, 1567.38] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [68, 2786.46] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [68, 4570.23] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - [76, 6603.95] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [76, 8363.02] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [71, 10050.8] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [70, 10578.1] ++ - - [128, 2048, 1, 64, 160, 160, 128, 96] ++ - [27, 2719.17] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [87, 5084.79] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [80, 8438.18] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [79, 12438.0] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [90, 16673.1] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [76, 20115.1] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [90, 21675.2] ++ - - [128, 4096, 1, 64, 160, 160, 128, 96] ++ - [97, 4774.41] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [6, 8442.51] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [17, 13930.3] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [27, 21452.6] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [21, 30345.4] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [27, 36544.4] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [27, 34812.1] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [10, 162.848] ++ - - 
[256, 64, 1, 128, 288, 288, 256, 160] ++ - [78, 267.019] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [78, 453.195] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [78, 695.518] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [83, 937.436] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [68, 1166.63] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [76, 1265.4] ++ - - [256, 128, 1, 64, 288, 288, 256, 96] ++ - [56, 367.214] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [78, 666.506] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [27, 1088.16] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [83, 1545.01] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [85, 2095.06] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [88, 2436.52] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [85, 2614.71] ++ - - [256, 256, 1, 64, 288, 288, 256, 96] ++ - [27, 781.794] ++ - - [256, 256, 1, 128, 288, 288, 256, 160] ++ - [87, 1389.08] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [7, 2230.43] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [83, 3285.14] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [85, 4186.2] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [79, 5019.76] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [79, 5269.76] ++ - - [256, 512, 1, 64, 288, 288, 256, 96] ++ - [99, 1557.49] ++ - - [256, 512, 1, 128, 288, 288, 256, 160] ++ - [68, 2791.09] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [78, 4560.9] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [7, 6477.72] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [85, 8405.43] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [82, 9929.21] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [90, 10515.4] ++ - - [256, 1024, 1, 64, 288, 288, 256, 96] ++ - [68, 2916.77] ++ - - [256, 1024, 1, 128, 288, 288, 256, 160] ++ - [83, 5190.17] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [7, 8400.16] ++ - - [256, 1024, 1, 512, 288, 288, 
544, 544] ++ - [77, 12208.3] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [71, 16455.4] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [87, 19471.7] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [85, 21216.1] ++ - - [256, 2048, 1, 64, 288, 288, 256, 96] ++ - [56, 4964.43] ++ - - [256, 2048, 1, 128, 288, 288, 256, 160] ++ - [17, 8974.2] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [27, 14845.5] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [17, 22319.5] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [17, 29610.7] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [17, 36750.7] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [26, 39371.0] ++ - - [256, 4096, 1, 64, 288, 288, 256, 96] ++ - [92, 8456.26] ++ - - [256, 4096, 1, 128, 288, 288, 256, 160] ++ - [18, 14592.1] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [36, 23638.3] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [58, 31683.3] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [47, 36785.9] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [62, 39229.3] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [65, 38514.8] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [11, 238.639] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [27, 412.285] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [83, 674.691] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [78, 1060.24] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [85, 1426.07] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [79, 1773.06] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [71, 1903.95] ++ - - [384, 128, 1, 64, 416, 416, 384, 96] ++ - [27, 560.139] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [27, 1016.89] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [72, 1655.87] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [68, 2424.46] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [87, 3153.62] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 
2080] ++ - [84, 3688.8] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [82, 3913.14] ++ - - [384, 256, 1, 64, 416, 416, 384, 96] ++ - [17, 1193.15] ++ - - [384, 256, 1, 128, 416, 416, 384, 160] ++ - [5, 2066.5] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [68, 3279.37] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [83, 4937.88] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [71, 6428.48] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [84, 7479.0] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [71, 7907.75] ++ - - [384, 512, 1, 64, 416, 416, 384, 96] ++ - [21, 2154.99] ++ - - [384, 512, 1, 128, 416, 416, 384, 160] ++ - [74, 3838.01] ++ - - [384, 512, 1, 256, 416, 416, 384, 288] ++ - [78, 6580.2] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [71, 9646.74] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [69, 12321.9] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [69, 14693.3] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [81, 15659.0] ++ - - [384, 1024, 1, 64, 416, 416, 384, 96] ++ - [80, 3818.22] ++ - - [384, 1024, 1, 128, 416, 416, 384, 160] ++ - [5, 7041.39] ++ - - [384, 1024, 1, 256, 416, 416, 384, 288] ++ - [7, 11616.0] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [6, 16951.0] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [7, 23068.1] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [16, 27417.6] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [7, 29648.4] ++ - - [384, 2048, 1, 64, 416, 416, 384, 96] ++ - [37, 6673.54] ++ - - [384, 2048, 1, 128, 416, 416, 384, 160] ++ - [96, 12415.3] ++ - - [384, 2048, 1, 256, 416, 416, 384, 288] ++ - [57, 21413.3] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [53, 28504.6] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [59, 35455.8] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [57, 38689.8] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [50, 40022.8] ++ - - [384, 4096, 1, 64, 416, 416, 384, 96] ++ - [22, 
11686.1] ++ - - [384, 4096, 1, 128, 416, 416, 384, 160] ++ - [30, 19168.6] ++ - - [384, 4096, 1, 256, 416, 416, 384, 288] ++ - [12, 26745.6] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [13, 32553.6] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [3, 35289.7] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [2, 38532.5] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [25, 37187.9] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [1, 489.153] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [68, 827.282] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [78, 1393.92] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [68, 2180.94] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [7, 2891.14] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [85, 3566.85] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [71, 3823.73] ++ - - [768, 128, 1, 64, 800, 800, 768, 96] ++ - [27, 1092.08] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [93, 2065.83] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [68, 3411.4] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [79, 4928.21] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [85, 6244.26] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [76, 7352.01] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [76, 7789.5] ++ - - [768, 256, 1, 64, 800, 800, 768, 96] ++ - [42, 2291.98] ++ - - [768, 256, 1, 128, 800, 800, 768, 160] ++ - [27, 4029.77] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [79, 6560.43] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [7, 9468.0] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [79, 12546.1] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [76, 14596.3] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [84, 15715.5] ++ - - [768, 512, 1, 64, 800, 800, 768, 96] ++ - [7, 3751.05] ++ - - [768, 512, 1, 128, 800, 800, 768, 160] ++ - [32, 6741.47] ++ - - [768, 512, 1, 256, 800, 800, 768, 288] ++ - [17, 11218.5] ++ - - [768, 512, 1, 
512, 800, 800, 768, 544] ++ - [27, 17317.0] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [27, 22698.8] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [27, 27706.3] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [27, 29548.3] ++ - - [768, 1024, 1, 64, 800, 800, 768, 96] ++ - [29, 7174.89] ++ - - [768, 1024, 1, 128, 800, 800, 768, 160] ++ - [95, 12006.6] ++ - - [768, 1024, 1, 256, 800, 800, 768, 288] ++ - [96, 19163.1] ++ - - [768, 1024, 1, 512, 800, 800, 768, 544] ++ - [39, 29144.0] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [55, 35034.8] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [35, 37956.8] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [38, 40750.9] ++ - - [768, 2048, 1, 64, 800, 800, 768, 96] ++ - [25, 12988.8] ++ - - [768, 2048, 1, 128, 800, 800, 768, 160] ++ - [11, 19550.2] ++ - - [768, 2048, 1, 256, 800, 800, 768, 288] ++ - [65, 27524.4] ++ - - [768, 2048, 1, 512, 800, 800, 768, 544] ++ - [2, 33781.1] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [37, 36378.4] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [56, 40192.6] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [58, 41995.9] ++ - - [768, 4096, 1, 64, 800, 800, 768, 96] ++ - [60, 17393.3] ++ - - [768, 4096, 1, 128, 800, 800, 768, 160] ++ - [63, 25542.6] ++ - - [768, 4096, 1, 256, 800, 800, 768, 288] ++ - [40, 32287.2] ++ - - [768, 4096, 1, 512, 800, 800, 768, 544] ++ - [58, 35993.8] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [45, 40278.5] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [42, 42644.7] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [60, 42741.3] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [9, 983.809] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [27, 1747.87] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [9, 2885.34] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [85, 4350.19] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [79, 5848.11] ++ - - 
[1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [70, 7060.63] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [82, 7795.5] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] ++ - [27, 2216.08] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [17, 4002.85] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [68, 6505.32] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [79, 9543.38] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [7, 12204.6] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [71, 14911.4] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [82, 15818.6] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] ++ - [17, 3796.32] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] ++ - [33, 7098.96] ++ - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] ++ - [27, 11779.0] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [27, 16995.3] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [27, 22810.6] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [7, 27881.7] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [6, 29707.4] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] ++ - [94, 6461.89] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] ++ - [98, 12026.7] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] ++ - [43, 19672.3] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [43, 28391.8] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] ++ - [46, 35477.6] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [43, 37564.4] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [50, 39385.5] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] ++ - [24, 12490.8] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] ++ - [31, 19632.0] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] ++ - [47, 27721.4] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] ++ - [60, 33755.6] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [56, 36597.3] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - 
[42, 40184.9] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [45, 42014.7] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] ++ - [96, 17813.4] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] ++ - [62, 25087.4] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] ++ - [65, 32311.8] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] ++ - [45, 36053.4] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] ++ - [37, 40237.1] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [56, 42683.6] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [58, 43264.3] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] ++ - [51, 21697.0] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] ++ - [65, 29651.5] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] ++ - [62, 34229.7] ++ - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] ++ - [40, 38868.4] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] ++ - [60, 41956.9] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [54, 43003.1] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [37, 43321.6] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [0, 1783.29] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [64, 3207.06] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [26, 5506.73] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [32, 8069.19] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [4, 11091.7] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [4, 13544.6] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [6, 14702.6] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] ++ - [32, 4081.39] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [26, 7329.47] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [7, 12504.8] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [26, 17778.7] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [26, 23416.8] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [7, 28229.6] ++ - - [3072, 128, 1, 4096, 3104, 
3104, 4128, 4128] ++ - [16, 29952.6] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] ++ - [11, 8056.9] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] ++ - [59, 13443.3] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [41, 21435.9] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [53, 28453.9] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [57, 35023.9] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [46, 38268.6] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [28, 37941.3] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] ++ - [23, 12455.2] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] ++ - [13, 19616.7] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] ++ - [45, 27865.2] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [47, 33788.1] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [42, 36544.1] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [42, 40250.1] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [47, 41325.5] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] ++ - [47, 17316.9] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] ++ - [65, 25113.9] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] ++ - [52, 32737.3] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] ++ - [56, 36306.1] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [56, 40172.9] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - [37, 42758.9] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [45, 43195.4] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] ++ - [62, 21694.6] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] ++ - [62, 29738.0] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] ++ - [61, 34272.7] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] ++ - [48, 38863.2] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] ++ - [47, 41971.9] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [42, 43036.4] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [37, 42925.6] 
++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] ++ - [51, 25010.2] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] ++ - [49, 30886.0] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] ++ - [48, 36642.7] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] ++ - [49, 40414.3] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] ++ - [45, 42176.5] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] ++ - [42, 43321.3] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [44, 41457.0] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [4, 2489.94] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [32, 4083.03] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [16, 6885.06] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [26, 10896.0] ++ - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] ++ - [16, 14619.1] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [6, 18105.7] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [20, 19414.2] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] ++ - [10, 5634.67] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [9, 10540.1] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [17, 17084.7] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [7, 24790.8] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [17, 31513.9] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [15, 37249.0] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [21, 38655.1] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] ++ - [48, 10338.8] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] ++ - [40, 16927.4] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [60, 24593.2] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [58, 31291.6] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [60, 37382.6] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [40, 39381.7] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [34, 41215.3] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] ++ - 
[54, 14514.7] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] ++ - [61, 22022.8] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] ++ - [40, 30156.2] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [42, 35698.5] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [60, 38757.6] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [40, 41716.1] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [46, 42775.1] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] ++ - [61, 19273.1] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] ++ - [65, 27091.4] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] ++ - [61, 33915.8] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] ++ - [62, 37419.7] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [54, 41089.5] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [60, 43202.6] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [58, 42751.4] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] ++ - [61, 23254.2] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] ++ - [56, 30652.9] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] ++ - [62, 35241.6] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] ++ - [58, 39681.5] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] ++ - [58, 42476.9] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [42, 42935.5] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [61, 42847.9] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] ++ - [58, 23153.0] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] ++ - [60, 30122.8] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] ++ - [91, 35267.4] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] ++ - [47, 40597.4] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] ++ - [47, 41985.1] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] ++ - [40, 43431.8] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [99, 40503.4] ++- null ++- null ++- DeviceEfficiency +diff --git 
a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HHS_BH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HHS_BH_GB.yaml +new file mode 100644 +index 00000000..93668f8f +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_HHS_BH_GB.yaml +@@ -0,0 +1,27843 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 
++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ 
UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ 
GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ 
NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ 
StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 
0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ 
MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: 
false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ 
AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 
++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 
4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 
++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 
1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 
++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ 
Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 
1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ 
LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ 
IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B 
++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 
1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ 
ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: 
Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 
++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false 
++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ 
LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ 
ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ 
VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: 
true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ 
PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false 
++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 
4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ 
GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false 
++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 
4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false 
++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 
2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false 
++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 
4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ 
GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false 
++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ 
VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ 
GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ 
FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 
1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ 
MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: 
false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ 
LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ 
Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ 
UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ 
GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ 
PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ 
StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 
0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ 
MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 
1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ 
LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 
1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 
1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ 
EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ 
MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: 
false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 1 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ 
MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ 
IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: 
false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ 
NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ 
StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ 
DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ 
MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: 
false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ 
AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 
1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 
++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 
++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 
1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 
2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ 
Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 
1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false 
++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 
1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true 
++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ 
IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B 
++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 
1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 
++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ 
ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: 
Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 
++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 79 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 80 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 81 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false 
++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 82 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ 
LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ 
ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 83 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ 
VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: 
true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ 
PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 84 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 85 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 
++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 86 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ 
LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 87 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ 
VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ 
GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 88 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ 
FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 89 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 
16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ 
MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 90 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: 
false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 
0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true 
++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 91 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ 
UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 
++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 1 ++ LSPB: 4 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ 
PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 92 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: 
false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: 
false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 93 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 
1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ 
MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 94 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false 
++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ 
LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ 
ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 95 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ 
UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 
++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ 
PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 96 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ 
FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 97 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 
1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ 
MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 98 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: 
false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 
0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true 
++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 99 ++ SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ 
UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [2, 35.3438] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [10, 59.4905] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [80, 99.0533] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [83, 158.228] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [67, 223.232] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [80, 280.387] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [75, 312.98] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [23, 65.4054] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [23, 114.737] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [83, 198.051] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [67, 314.417] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [80, 444.948] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [68, 561.694] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [79, 624.758] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [19, 148.776] ++ - - [64, 256, 1, 128, 96, 96, 160, 160] ++ - [87, 266.036] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [83, 453.832] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [80, 677.703] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [87, 942.385] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [79, 1182.04] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [79, 1286.9] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [34, 298.019] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [68, 543.307] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [83, 954.23] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [5, 1422.41] 
++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [79, 1943.39] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [90, 2382.8] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [84, 2578.09] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [83, 640.841] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [78, 1160.25] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [66, 1944.06] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [86, 2955.31] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [76, 3905.1] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [75, 4816.38] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [71, 5189.58] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [83, 1203.02] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [77, 2285.42] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [83, 3851.97] ++ - - [64, 2048, 1, 512, 96, 96, 544, 544] ++ - [79, 5670.86] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [85, 7965.46] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [79, 9678.61] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [89, 10660.7] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [6, 2186.53] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [6, 3834.38] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [6, 6552.34] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [16, 10422.3] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [33, 14322.0] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [27, 17879.7] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [14, 18480.7] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [1, 73.5639] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [11, 118.873] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [87, 204.143] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [9, 317.438] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [68, 445.517] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [73, 559.988] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [75, 622.903] ++ - - [128, 
128, 1, 64, 160, 160, 128, 96] ++ - [42, 170.751] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [34, 296.46] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [27, 511.19] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [68, 761.289] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [27, 999.126] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [79, 1217.24] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [75, 1299.3] ++ - - [128, 256, 1, 64, 160, 160, 128, 96] ++ - [10, 375.363] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [27, 662.924] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [66, 1085.77] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [7, 1584.11] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [79, 2060.71] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [76, 2473.52] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [67, 2668.41] ++ - - [128, 512, 1, 64, 160, 160, 128, 96] ++ - [8, 778.312] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [68, 1381.52] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [68, 2269.96] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [7, 3219.89] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [74, 4286.2] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [80, 4974.72] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [85, 5294.81] ++ - - [128, 1024, 1, 64, 160, 160, 128, 96] ++ - [22, 1567.38] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [68, 2786.46] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [68, 4570.23] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - [76, 6603.95] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [76, 8363.02] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [71, 10050.8] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [70, 10578.1] ++ - - [128, 2048, 1, 64, 160, 160, 128, 96] ++ - [27, 2719.17] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [87, 5084.79] ++ - - [128, 2048, 1, 256, 160, 
160, 288, 288] ++ - [80, 8438.18] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [79, 12438.0] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [90, 16673.1] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [76, 20115.1] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [90, 21675.2] ++ - - [128, 4096, 1, 64, 160, 160, 128, 96] ++ - [97, 4774.41] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [6, 8442.51] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [17, 13930.3] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [27, 21452.6] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [21, 30345.4] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [27, 36544.4] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [27, 34812.1] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [10, 162.848] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [78, 267.019] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [78, 453.195] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [78, 695.518] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [83, 937.436] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [68, 1166.63] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [76, 1265.4] ++ - - [256, 128, 1, 64, 288, 288, 256, 96] ++ - [56, 367.214] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [78, 666.506] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [27, 1088.16] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [83, 1545.01] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [85, 2095.06] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [88, 2436.52] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [85, 2614.71] ++ - - [256, 256, 1, 64, 288, 288, 256, 96] ++ - [27, 781.794] ++ - - [256, 256, 1, 128, 288, 288, 256, 160] ++ - [87, 1389.08] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [7, 2230.43] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [83, 3285.14] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ 
- [85, 4186.2] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [79, 5019.76] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [79, 5269.76] ++ - - [256, 512, 1, 64, 288, 288, 256, 96] ++ - [99, 1557.49] ++ - - [256, 512, 1, 128, 288, 288, 256, 160] ++ - [68, 2791.09] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [78, 4560.9] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [7, 6477.72] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [85, 8405.43] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [82, 9929.21] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [90, 10515.4] ++ - - [256, 1024, 1, 64, 288, 288, 256, 96] ++ - [68, 2916.77] ++ - - [256, 1024, 1, 128, 288, 288, 256, 160] ++ - [83, 5190.17] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [7, 8400.16] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [77, 12208.3] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [71, 16455.4] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [87, 19471.7] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [85, 21216.1] ++ - - [256, 2048, 1, 64, 288, 288, 256, 96] ++ - [56, 4964.43] ++ - - [256, 2048, 1, 128, 288, 288, 256, 160] ++ - [17, 8974.2] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [27, 14845.5] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [17, 22319.5] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [17, 29610.7] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [17, 36750.7] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [26, 39371.0] ++ - - [256, 4096, 1, 64, 288, 288, 256, 96] ++ - [92, 8456.26] ++ - - [256, 4096, 1, 128, 288, 288, 256, 160] ++ - [18, 14592.1] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [36, 23638.3] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [58, 31683.3] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [47, 36785.9] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [62, 39229.3] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ 
- [65, 38514.8] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [11, 238.639] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [27, 412.285] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [83, 674.691] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [78, 1060.24] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [85, 1426.07] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [79, 1773.06] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [71, 1903.95] ++ - - [384, 128, 1, 64, 416, 416, 384, 96] ++ - [27, 560.139] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [27, 1016.89] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [72, 1655.87] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [68, 2424.46] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [87, 3153.62] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [84, 3688.8] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [82, 3913.14] ++ - - [384, 256, 1, 64, 416, 416, 384, 96] ++ - [17, 1193.15] ++ - - [384, 256, 1, 128, 416, 416, 384, 160] ++ - [5, 2066.5] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [68, 3279.37] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [83, 4937.88] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [71, 6428.48] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [84, 7479.0] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [71, 7907.75] ++ - - [384, 512, 1, 64, 416, 416, 384, 96] ++ - [21, 2154.99] ++ - - [384, 512, 1, 128, 416, 416, 384, 160] ++ - [74, 3838.01] ++ - - [384, 512, 1, 256, 416, 416, 384, 288] ++ - [78, 6580.2] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [71, 9646.74] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [69, 12321.9] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [69, 14693.3] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [81, 15659.0] ++ - - [384, 1024, 1, 64, 416, 416, 384, 96] ++ - [80, 3818.22] ++ - - [384, 1024, 1, 128, 416, 416, 384, 160] ++ - [5, 7041.39] ++ - - [384, 1024, 
1, 256, 416, 416, 384, 288] ++ - [7, 11616.0] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [6, 16951.0] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [7, 23068.1] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [16, 27417.6] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [7, 29648.4] ++ - - [384, 2048, 1, 64, 416, 416, 384, 96] ++ - [37, 6673.54] ++ - - [384, 2048, 1, 128, 416, 416, 384, 160] ++ - [96, 12415.3] ++ - - [384, 2048, 1, 256, 416, 416, 384, 288] ++ - [57, 21413.3] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [53, 28504.6] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [59, 35455.8] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [57, 38689.8] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [50, 40022.8] ++ - - [384, 4096, 1, 64, 416, 416, 384, 96] ++ - [22, 11686.1] ++ - - [384, 4096, 1, 128, 416, 416, 384, 160] ++ - [30, 19168.6] ++ - - [384, 4096, 1, 256, 416, 416, 384, 288] ++ - [12, 26745.6] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [13, 32553.6] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [3, 35289.7] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [2, 38532.5] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [25, 37187.9] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [1, 489.153] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [68, 827.282] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [78, 1393.92] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [68, 2180.94] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [7, 2891.14] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [85, 3566.85] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [71, 3823.73] ++ - - [768, 128, 1, 64, 800, 800, 768, 96] ++ - [27, 1092.08] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [93, 2065.83] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [68, 3411.4] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [79, 4928.21] ++ - - [768, 128, 1, 1024, 800, 800, 
1056, 1056] ++ - [85, 6244.26] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [76, 7352.01] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [76, 7789.5] ++ - - [768, 256, 1, 64, 800, 800, 768, 96] ++ - [42, 2291.98] ++ - - [768, 256, 1, 128, 800, 800, 768, 160] ++ - [27, 4029.77] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [79, 6560.43] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [7, 9468.0] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [79, 12546.1] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [76, 14596.3] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [84, 15715.5] ++ - - [768, 512, 1, 64, 800, 800, 768, 96] ++ - [7, 3751.05] ++ - - [768, 512, 1, 128, 800, 800, 768, 160] ++ - [32, 6741.47] ++ - - [768, 512, 1, 256, 800, 800, 768, 288] ++ - [17, 11218.5] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [27, 17317.0] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [27, 22698.8] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [27, 27706.3] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [27, 29548.3] ++ - - [768, 1024, 1, 64, 800, 800, 768, 96] ++ - [29, 7174.89] ++ - - [768, 1024, 1, 128, 800, 800, 768, 160] ++ - [95, 12006.6] ++ - - [768, 1024, 1, 256, 800, 800, 768, 288] ++ - [96, 19163.1] ++ - - [768, 1024, 1, 512, 800, 800, 768, 544] ++ - [39, 29144.0] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [55, 35034.8] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [35, 37956.8] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [38, 40750.9] ++ - - [768, 2048, 1, 64, 800, 800, 768, 96] ++ - [25, 12988.8] ++ - - [768, 2048, 1, 128, 800, 800, 768, 160] ++ - [11, 19550.2] ++ - - [768, 2048, 1, 256, 800, 800, 768, 288] ++ - [65, 27524.4] ++ - - [768, 2048, 1, 512, 800, 800, 768, 544] ++ - [2, 33781.1] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [37, 36378.4] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [56, 40192.6] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 
4128] ++ - [58, 41995.9] ++ - - [768, 4096, 1, 64, 800, 800, 768, 96] ++ - [60, 17393.3] ++ - - [768, 4096, 1, 128, 800, 800, 768, 160] ++ - [63, 25542.6] ++ - - [768, 4096, 1, 256, 800, 800, 768, 288] ++ - [40, 32287.2] ++ - - [768, 4096, 1, 512, 800, 800, 768, 544] ++ - [58, 35993.8] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [45, 40278.5] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [42, 42644.7] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [60, 42741.3] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [9, 983.809] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [27, 1747.87] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [9, 2885.34] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [85, 4350.19] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [79, 5848.11] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [70, 7060.63] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [82, 7795.5] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] ++ - [27, 2216.08] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [17, 4002.85] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [68, 6505.32] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [79, 9543.38] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [7, 12204.6] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [71, 14911.4] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [82, 15818.6] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] ++ - [17, 3796.32] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] ++ - [33, 7098.96] ++ - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] ++ - [27, 11779.0] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [27, 16995.3] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [27, 22810.6] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [7, 27881.7] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [6, 29707.4] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] ++ - [94, 
6461.89] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] ++ - [98, 12026.7] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] ++ - [43, 19672.3] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [43, 28391.8] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] ++ - [46, 35477.6] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [43, 37564.4] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [50, 39385.5] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] ++ - [24, 12490.8] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] ++ - [31, 19632.0] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] ++ - [47, 27721.4] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] ++ - [60, 33755.6] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [56, 36597.3] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [42, 40184.9] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [45, 42014.7] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] ++ - [96, 17813.4] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] ++ - [62, 25087.4] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] ++ - [65, 32311.8] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] ++ - [45, 36053.4] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] ++ - [37, 40237.1] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [56, 42683.6] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [58, 43264.3] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] ++ - [51, 21697.0] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] ++ - [65, 29651.5] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] ++ - [62, 34229.7] ++ - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] ++ - [40, 38868.4] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] ++ - [60, 41956.9] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [54, 43003.1] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [37, 43321.6] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [0, 1783.29] ++ - - [3072, 64, 1, 
128, 3104, 3104, 3072, 160] ++ - [64, 3207.06] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [26, 5506.73] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [32, 8069.19] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [4, 11091.7] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [4, 13544.6] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [6, 14702.6] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] ++ - [32, 4081.39] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [26, 7329.47] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [7, 12504.8] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [26, 17778.7] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [26, 23416.8] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [7, 28229.6] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [16, 29952.6] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] ++ - [11, 8056.9] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] ++ - [59, 13443.3] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [41, 21435.9] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [53, 28453.9] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [57, 35023.9] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [46, 38268.6] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [28, 37941.3] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] ++ - [23, 12455.2] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] ++ - [13, 19616.7] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] ++ - [45, 27865.2] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [47, 33788.1] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [42, 36544.1] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [42, 40250.1] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [47, 41325.5] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] ++ - [47, 17316.9] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] ++ - [65, 25113.9] ++ - - [3072, 
1024, 1, 256, 3104, 3104, 3072, 288] ++ - [52, 32737.3] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] ++ - [56, 36306.1] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [56, 40172.9] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - [37, 42758.9] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [45, 43195.4] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] ++ - [62, 21694.6] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] ++ - [62, 29738.0] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] ++ - [61, 34272.7] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] ++ - [48, 38863.2] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] ++ - [47, 41971.9] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [42, 43036.4] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [37, 42925.6] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] ++ - [51, 25010.2] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] ++ - [49, 30886.0] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] ++ - [48, 36642.7] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] ++ - [49, 40414.3] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] ++ - [45, 42176.5] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] ++ - [42, 43321.3] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [44, 41457.0] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [4, 2489.94] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [32, 4083.03] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [16, 6885.06] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [26, 10896.0] ++ - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] ++ - [16, 14619.1] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [6, 18105.7] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [20, 19414.2] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] ++ - [10, 5634.67] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [9, 10540.1] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - 
[17, 17084.7] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [7, 24790.8] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [17, 31513.9] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [15, 37249.0] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [21, 38655.1] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] ++ - [48, 10338.8] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] ++ - [40, 16927.4] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [60, 24593.2] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [58, 31291.6] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [60, 37382.6] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [40, 39381.7] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [34, 41215.3] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] ++ - [54, 14514.7] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] ++ - [61, 22022.8] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] ++ - [40, 30156.2] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [42, 35698.5] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [60, 38757.6] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [40, 41716.1] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [46, 42775.1] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] ++ - [61, 19273.1] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] ++ - [65, 27091.4] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] ++ - [61, 33915.8] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] ++ - [62, 37419.7] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [54, 41089.5] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [60, 43202.6] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [58, 42751.4] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] ++ - [61, 23254.2] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] ++ - [56, 30652.9] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] ++ - [62, 35241.6] ++ - - [4096, 2048, 1, 512, 
4128, 4128, 4096, 544] ++ - [58, 39681.5] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] ++ - [58, 42476.9] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [42, 42935.5] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [61, 42847.9] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] ++ - [58, 23153.0] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] ++ - [60, 30122.8] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] ++ - [91, 35267.4] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] ++ - [47, 40597.4] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] ++ - [47, 41985.1] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] ++ - [40, 43431.8] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [99, 40503.4] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_I8II_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_I8II_BH.yaml +new file mode 100644 +index 00000000..2db4bd59 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_I8II_BH.yaml +@@ -0,0 +1,28113 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true 
++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: 
[0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ 
SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ 
GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ 
NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 
++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ 
CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 
++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ 
NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ 
VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ 
GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false 
++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 
16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] 
++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: 
false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ 
LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ 
Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ 
UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ 
GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ 
PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ 
StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 
1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ 
MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false 
++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ 
LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true 
++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12416 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ 
StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: 
ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ 
MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false 
++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ 
LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ 
IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ 
ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 
8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 
++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: 
{0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: 
false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: 
[3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: 
null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false 
++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false 
++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ 
ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ 
GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 
1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: 
Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ 
AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ 
LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ 
Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 
++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: 
false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] 
++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ 
LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ 
VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: 
true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ 
PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 
++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ 
GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: 
true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ 
MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ 
NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ 
AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 
++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: 
false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ 
Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ 
GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ 
PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: 
false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ 
Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ 
MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: 
false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12416 ++ 
LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: 
true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ 
NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ 
StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ 
DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ 
MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: 
false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} 
++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ 
_VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ 
ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ 
SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true 
++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true 
++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ 
IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B 
++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ 
LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ 
PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ 
ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true 
++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: 
Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ 
AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ 
LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ 
Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 
++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ 
CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ 
MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ 
NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12416 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ 
VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ 
GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false 
++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true 
++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: 
[16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: 
[] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ 
AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ 
LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ 
AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: 
false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 24832 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4352 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ 
StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: 
ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 
16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ 
TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 
++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ 
LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ 
IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 
++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ 
UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ 
GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8448 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4352 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ 
NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ 
StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 
0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ 
MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ 
TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false 
++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 
1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 
++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 
++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true 
++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8448 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4352 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true 
++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ 
IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ 
WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 
++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 
++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true 
++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: 
Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: 
{} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: 
true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false 
++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false 
++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 
++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ 
VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 24832 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4352 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ 
StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ 
BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ 
MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ 
NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ 
AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 
++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true 
++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ 
UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 
2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ 
PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: 
false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ 
Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ 
MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 79 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: 
false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 
++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ 
ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 80 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ 
NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 81 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ 
StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ 
DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 
++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ 
TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 82 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false 
++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 
1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 83 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 84 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 
++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 
++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 85 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ 
SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true 
++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 86 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: 
true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 
++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 87 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: 
B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ 
LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ 
PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 88 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ 
ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true 
++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 89 ++ SolutionNameMin: 
Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 90 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: 
{} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: 
true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false 
++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 91 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 92 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 93 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: 
false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] 
++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 94 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 
++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 95 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: 
-1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 96 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ 
StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 
16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ 
NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ 
ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 97 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: 
true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ 
MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ 
NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 98 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ 
AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 
++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true 
++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 99 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ 
UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: 
false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ 
PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 100 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ 
StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [1, 36.4646] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [18, 61.6157] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [16, 106.357] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [57, 165.953] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [56, 234.804] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [57, 302.751] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [69, 342.009] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [27, 66.9845] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [16, 122.971] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [50, 212.456] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [10, 333.729] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [56, 475.114] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [56, 603.736] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [56, 678.073] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [30, 152.987] ++ - - [64, 256, 1, 128, 96, 96, 160, 160] ++ - [9, 279.807] ++ 
- - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [56, 462.133] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [56, 729.254] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [57, 1005.11] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [69, 1263.73] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [87, 1426.73] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [86, 327.937] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [56, 598.93] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [69, 1017.66] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [56, 1569.43] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [95, 2130.17] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [69, 2585.39] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [56, 2898.37] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [69, 678.363] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [79, 1226.4] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [15, 2020.87] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [79, 3073.6] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [79, 4307.93] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [56, 5174.17] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [69, 5820.88] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [95, 1267.74] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [86, 2280.75] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [79, 3761.3] ++ - - [64, 2048, 1, 512, 96, 96, 544, 544] ++ - [95, 5872.85] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [69, 8268.72] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [62, 10099.5] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [69, 11490.8] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [32, 2119.67] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [2, 3783.77] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [33, 6294.81] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [47, 9922.23] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [21, 13753.9] ++ - - [64, 4096, 1, 2048, 96, 96, 
2080, 2080] ++ - [50, 17211.9] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [21, 18627.1] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [4, 64.0156] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [27, 122.943] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [27, 212.37] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [79, 326.837] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [57, 482.27] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [63, 600.667] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [96, 681.212] ++ - - [128, 128, 1, 64, 160, 160, 128, 96] ++ - [25, 182.742] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [31, 334.154] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [96, 534.715] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [87, 821.929] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [63, 1099.14] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [87, 1311.03] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [63, 1468.5] ++ - - [128, 256, 1, 64, 160, 160, 128, 96] ++ - [31, 395.316] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [29, 697.076] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [87, 1106.39] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [56, 1663.09] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [96, 2274.88] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [87, 2670.79] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [96, 2961.36] ++ - - [128, 512, 1, 64, 160, 160, 128, 96] ++ - [57, 807.529] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [87, 1425.43] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [10, 2310.91] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [69, 3390.72] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [87, 4593.04] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [57, 5393.53] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [69, 5953.86] ++ - - [128, 1024, 1, 64, 160, 160, 128, 96] ++ - [86, 1467.31] 
++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [80, 2616.13] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [57, 4378.2] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - [70, 6425.61] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [70, 8702.45] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [87, 10634.9] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [57, 11760.6] ++ - - [128, 2048, 1, 64, 160, 160, 128, 96] ++ - [56, 2577.15] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [73, 4849.64] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [69, 7956.94] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [70, 12220.5] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [96, 16674.0] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [96, 20479.6] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [95, 23351.4] ++ - - [128, 4096, 1, 64, 160, 160, 128, 96] ++ - [56, 4976.19] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [54, 9346.64] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [19, 14421.2] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [97, 22911.9] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [52, 30915.1] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [2, 36468.6] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [2, 38064.5] ++ - - [256, 64, 1, 64, 288, 288, 256, 96] ++ - [2, 162.823] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [15, 268.419] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [56, 460.356] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [56, 746.185] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [56, 1014.71] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [96, 1275.69] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [69, 1422.28] ++ - - [256, 128, 1, 64, 288, 288, 256, 96] ++ - [43, 391.698] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [31, 694.306] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [56, 1155.94] ++ - - [256, 
128, 1, 512, 288, 288, 544, 544] ++ - [56, 1719.68] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [69, 2227.02] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [79, 2702.74] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [95, 2980.24] ++ - - [256, 256, 1, 64, 288, 288, 256, 96] ++ - [96, 754.782] ++ - - [256, 256, 1, 128, 288, 288, 256, 160] ++ - [69, 1347.57] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [69, 2257.73] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [86, 3482.93] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [69, 4517.6] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [96, 5444.5] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [86, 6009.98] ++ - - [256, 512, 1, 64, 288, 288, 256, 96] ++ - [28, 1443.08] ++ - - [256, 512, 1, 128, 288, 288, 256, 160] ++ - [95, 2505.93] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [87, 4221.21] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [87, 6617.58] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [56, 8807.54] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [96, 10603.8] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [80, 11754.9] ++ - - [256, 1024, 1, 64, 288, 288, 256, 96] ++ - [11, 2571.61] ++ - - [256, 1024, 1, 128, 288, 288, 256, 160] ++ - [12, 4605.96] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [69, 7776.23] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [69, 12336.2] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [70, 16512.0] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [86, 20510.8] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [95, 23165.5] ++ - - [256, 2048, 1, 64, 288, 288, 256, 96] ++ - [95, 4740.67] ++ - - [256, 2048, 1, 128, 288, 288, 256, 160] ++ - [35, 8823.15] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [24, 14671.9] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [38, 22021.0] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [88, 29947.7] ++ - - [256, 2048, 1, 
2048, 288, 288, 2080, 2080] ++ - [44, 36415.3] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [2, 38295.3] ++ - - [256, 4096, 1, 64, 288, 288, 256, 96] ++ - [86, 9478.65] ++ - - [256, 4096, 1, 128, 288, 288, 256, 160] ++ - [89, 15957.4] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [19, 22608.9] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [76, 29594.4] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [61, 35018.7] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [91, 37462.2] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [99, 40120.4] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [14, 251.017] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [26, 431.394] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [98, 720.836] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [56, 1145.88] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [56, 1550.19] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [56, 1936.95] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [56, 2150.38] ++ - - [384, 128, 1, 64, 416, 416, 384, 96] ++ - [87, 596.689] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [87, 1058.64] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [13, 1729.85] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [69, 2614.63] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [87, 3348.52] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [62, 4071.32] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [69, 4475.92] ++ - - [384, 256, 1, 64, 416, 416, 384, 96] ++ - [57, 1101.83] ++ - - [384, 256, 1, 128, 416, 416, 384, 160] ++ - [70, 2065.83] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [56, 3419.27] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [70, 5109.31] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [86, 6658.08] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [87, 8087.04] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [87, 8924.45] ++ - - [384, 512, 1, 64, 416, 416, 
384, 96] ++ - [70, 1954.78] ++ - - [384, 512, 1, 128, 416, 416, 384, 160] ++ - [63, 3559.52] ++ - - [384, 512, 1, 256, 416, 416, 384, 288] ++ - [63, 6024.14] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [57, 9550.6] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [70, 12682.8] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [80, 15623.1] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [79, 17438.4] ++ - - [384, 1024, 1, 64, 416, 416, 384, 96] ++ - [96, 4041.42] ++ - - [384, 1024, 1, 128, 416, 416, 384, 160] ++ - [42, 7122.09] ++ - - [384, 1024, 1, 256, 416, 416, 384, 288] ++ - [53, 11299.1] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [23, 17297.6] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [5, 22698.8] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [53, 27014.7] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [51, 28693.8] ++ - - [384, 2048, 1, 64, 416, 416, 384, 96] ++ - [87, 7737.38] ++ - - [384, 2048, 1, 128, 416, 416, 384, 160] ++ - [89, 13477.5] ++ - - [384, 2048, 1, 256, 416, 416, 384, 288] ++ - [52, 20140.8] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [76, 27226.6] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [75, 34287.3] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [75, 36698.3] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [66, 39379.8] ++ - - [384, 4096, 1, 64, 416, 416, 384, 96] ++ - [94, 12294.0] ++ - - [384, 4096, 1, 128, 416, 416, 384, 160] ++ - [92, 19603.4] ++ - - [384, 4096, 1, 256, 416, 416, 384, 288] ++ - [67, 26502.6] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [3, 32352.0] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [24, 35150.1] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [5, 38319.7] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [0, 39902.5] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [3, 521.336] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [69, 911.937] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] 
++ - [95, 1548.48] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [69, 2366.99] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [95, 3198.91] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [86, 3854.62] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [95, 4325.24] ++ - - [768, 128, 1, 64, 800, 800, 768, 96] ++ - [56, 1136.87] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [70, 2036.41] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [17, 3333.67] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [56, 4946.6] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [87, 6696.17] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [87, 7940.01] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [70, 8809.26] ++ - - [768, 256, 1, 64, 800, 800, 768, 96] ++ - [57, 1955.69] ++ - - [768, 256, 1, 128, 800, 800, 768, 160] ++ - [56, 3699.22] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [40, 6082.41] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [62, 9286.28] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [87, 12858.6] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [62, 15355.6] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [62, 17321.8] ++ - - [768, 512, 1, 64, 800, 800, 768, 96] ++ - [69, 3629.35] ++ - - [768, 512, 1, 128, 800, 800, 768, 160] ++ - [41, 7139.24] ++ - - [768, 512, 1, 256, 800, 800, 768, 288] ++ - [8, 10793.8] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [71, 16843.2] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [46, 22742.4] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [2, 26940.6] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [24, 28935.1] ++ - - [768, 1024, 1, 64, 800, 800, 768, 96] ++ - [86, 8121.94] ++ - - [768, 1024, 1, 128, 800, 800, 768, 160] ++ - [89, 13502.8] ++ - - [768, 1024, 1, 256, 800, 800, 768, 288] ++ - [91, 20195.3] ++ - - [768, 1024, 1, 512, 800, 800, 768, 544] ++ - [91, 27929.1] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [91, 33825.1] 
++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [91, 36410.4] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [91, 39323.6] ++ - - [768, 2048, 1, 64, 800, 800, 768, 96] ++ - [84, 12286.5] ++ - - [768, 2048, 1, 128, 800, 800, 768, 160] ++ - [35, 19217.9] ++ - - [768, 2048, 1, 256, 800, 800, 768, 288] ++ - [50, 25969.3] ++ - - [768, 2048, 1, 512, 800, 800, 768, 544] ++ - [75, 33185.3] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [75, 35955.2] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [75, 39112.5] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [75, 41087.1] ++ - - [768, 4096, 1, 64, 800, 800, 768, 96] ++ - [90, 17225.1] ++ - - [768, 4096, 1, 128, 800, 800, 768, 160] ++ - [93, 24667.9] ++ - - [768, 4096, 1, 256, 800, 800, 768, 288] ++ - [93, 31551.0] ++ - - [768, 4096, 1, 512, 800, 800, 768, 544] ++ - [91, 35398.1] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [91, 38977.7] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [75, 41188.8] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [75, 41797.4] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [3, 962.585] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [95, 1767.76] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [79, 3030.94] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [56, 4549.96] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [79, 6253.55] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [86, 7565.26] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [80, 8153.03] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] ++ - [30, 1928.43] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [62, 3691.63] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [62, 6226.1] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [86, 9224.16] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [69, 12645.3] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [96, 15299.0] ++ - - [1536, 128, 1, 4096, 
1568, 1568, 4128, 4128] ++ - [70, 17380.5] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] ++ - [48, 3743.81] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] ++ - [34, 6733.33] ++ - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] ++ - [71, 11227.2] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [44, 16777.3] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [88, 22415.7] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [3, 27039.1] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [22, 28858.4] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] ++ - [95, 8089.34] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] ++ - [72, 13493.8] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] ++ - [82, 19548.2] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [91, 28024.4] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] ++ - [75, 33843.5] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [91, 36464.0] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [60, 39425.1] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] ++ - [78, 12372.6] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] ++ - [36, 19315.7] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] ++ - [36, 26011.2] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] ++ - [75, 33351.6] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - [75, 36068.0] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [75, 39152.5] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [75, 41098.1] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] ++ - [78, 16872.9] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] ++ - [78, 24683.0] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] ++ - [93, 31600.6] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] ++ - [75, 35412.1] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] ++ - [75, 39036.7] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [75, 41217.2] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [75, 
41741.7] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] ++ - [98, 21902.4] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] ++ - [36, 29299.9] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] ++ - [36, 33728.7] ++ - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] ++ - [75, 38410.1] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] ++ - [75, 40913.8] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [75, 41475.7] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [61, 41726.0] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [48, 1550.39] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [33, 3030.93] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [26, 5128.05] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [21, 7643.38] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [50, 10644.3] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [9, 12953.7] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [21, 14306.7] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] ++ - [55, 3807.81] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [9, 6232.25] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [52, 10402.4] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [37, 17426.3] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [6, 22940.6] ++ - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] ++ - [39, 27226.6] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [45, 28950.2] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] ++ - [86, 7738.57] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] ++ - [77, 13586.6] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [35, 20181.1] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [100, 27539.4] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [76, 34329.7] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [60, 36726.7] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [82, 39650.8] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 
96] ++ - [77, 11863.7] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] ++ - [77, 18996.7] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] ++ - [50, 26614.7] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [76, 32837.5] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [75, 35822.5] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [60, 39309.2] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [91, 40947.9] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] ++ - [74, 17251.6] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] ++ - [93, 25246.3] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] ++ - [20, 31507.8] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] ++ - [75, 35200.1] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [75, 39209.7] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - [75, 41255.5] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [60, 41762.8] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] ++ - [81, 21704.1] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] ++ - [85, 29683.3] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] ++ - [82, 33793.9] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] ++ - [99, 38315.6] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] ++ - [75, 40925.8] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [60, 41684.5] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [75, 41418.4] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] ++ - [64, 6641.44] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] ++ - [65, 12983.6] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] ++ - [65, 25239.4] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] ++ - [63, 38579.9] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] ++ - [67, 40575.1] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] ++ - [75, 41624.3] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [76, 41686.9] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [9, 2286.04] ++ - - 
[4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [7, 4202.71] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [33, 7061.14] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [47, 11003.3] ++ - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] ++ - [14, 14683.8] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [3, 17541.4] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [2, 18721.0] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] ++ - [15, 4943.93] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [49, 9607.64] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [35, 14687.9] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [97, 22601.3] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [5, 30625.9] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [3, 36577.9] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [2, 38162.6] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] ++ - [69, 9396.37] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] ++ - [72, 16176.7] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [35, 22869.0] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [66, 30585.8] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [91, 35786.7] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [75, 37887.2] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [99, 40129.4] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] ++ - [52, 14114.8] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] ++ - [59, 21877.4] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] ++ - [83, 28425.4] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [60, 35024.4] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [91, 37284.7] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [75, 40275.0] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [60, 41785.8] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] ++ - [35, 18693.3] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] ++ - [85, 26867.8] 
++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] ++ - [35, 33306.7] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] ++ - [60, 36590.3] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [60, 39935.4] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [75, 41775.6] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [35, 41003.6] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] ++ - [90, 22677.7] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] ++ - [68, 30743.4] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] ++ - [50, 34783.9] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] ++ - [52, 38051.7] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] ++ - [56, 40550.6] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [61, 41121.9] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [76, 42286.6] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] ++ - [58, 6883.54] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] ++ - [58, 13780.3] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] ++ - [65, 26062.2] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] ++ - [63, 39162.5] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] ++ - [3, 40101.9] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] ++ - [76, 41557.7] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [76, 41818.3] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_I8II_BH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_I8II_BH_GB.yaml +new file mode 100644 +index 00000000..640c8106 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_I8II_BH_GB.yaml +@@ -0,0 +1,28113 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 
true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ 
MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false 
++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ 
LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true 
++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ 
StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: 
ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 
16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ 
TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 
++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ 
LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ 
IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 
++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ 
UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ 
GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ 
NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ 
StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 
0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ 
MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: 
false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ 
AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 
1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 
0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 
4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 
++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12416 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} 
++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: 
false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: 
[3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 
++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true 
++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ 
IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ 
WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ 
LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: 
-1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: 
Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ 
AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ 
LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ 
Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 
++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 
++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ 
ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ 
CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 
++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ 
NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ 
LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ 
VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ 
GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ 
FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 
16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ 
MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: 
false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ 
LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ 
Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ 
UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: 
false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ 
PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ 
StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ 
EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ 
MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: 
false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 
16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ 
IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer 
++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12416 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ 
NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ 
StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ 
DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ 
MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: 
false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} 
++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ 
_VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ 
ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ 
SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true 
++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ 
IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B 
++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ 
LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ 
PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ 
ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: 
Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: 
{} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: 
true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false 
++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 
1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: 
false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 2 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ 
LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: 
-1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12416 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2304 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ 
StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: 
false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 
1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ 
MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ 
AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ 
LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ 
AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: 
false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 16 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 1 ++ LVPA: 1 ++ LVPB: 2 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2048 ++ LdsNumElementsAlignedB: 2560 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2048 ++ LdsOffsetB_Blk: 10240 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false 
++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ 
EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 24832 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4352 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ 
MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: 
false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ 
LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ 
IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 
++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 
4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 
++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8448 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4352 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} 
++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: 
false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: 
[3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 
++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 
++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ 
SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: 
true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 
++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ 
WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ 
LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 8448 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4352 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: 
-1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: 
Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ 
AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ 
LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ 
Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 
++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 
++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ 
ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ 
CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 
++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ 
NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ 
LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ 
VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ 
GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false 
++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 24832 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 4352 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 
16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] 
++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ 
AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ 
LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ 
AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ 
Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ 
StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: 
ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 
16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ 
TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ 
LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ 
IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 
++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 79 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ 
ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 80 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 81 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: 
true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 
++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 82 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ 
WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 
++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 83 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 
++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 84 ++ SolutionNameMin: 
Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 85 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: 
{} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: 
true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false 
++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 86 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 87 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 88 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false 
++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 89 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 
++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 90 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ 
VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ 
GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 91 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ 
StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 2 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 92 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false 
++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] 
++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] 
++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 93 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ 
AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 
++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true 
++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 94 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ 
UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: 
false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ 
PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 95 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ 
StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ 
EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ 
MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: 
false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 96 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 
16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ 
IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 97 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 98 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: 
SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ 
NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 99 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 
++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ 
DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4096 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4096 ++ LdsOffsetB_Blk: 20480 ++ LdsPadA: 0 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 100 ++ SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [1, 36.4646] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [18, 61.6157] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [16, 106.357] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [57, 165.953] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [56, 234.804] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ 
- [57, 302.751] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [69, 342.009] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [27, 66.9845] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [16, 122.971] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [50, 212.456] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [10, 333.729] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [56, 475.114] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [56, 603.736] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [56, 678.073] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [30, 152.987] ++ - - [64, 256, 1, 128, 96, 96, 160, 160] ++ - [9, 279.807] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [56, 462.133] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [56, 729.254] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [57, 1005.11] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [69, 1263.73] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [87, 1426.73] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [86, 327.937] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [56, 598.93] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [69, 1017.66] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [56, 1569.43] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [95, 2130.17] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [69, 2585.39] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [56, 2898.37] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [69, 678.363] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [79, 1226.4] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [15, 2020.87] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [79, 3073.6] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [79, 4307.93] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [56, 5174.17] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [69, 5820.88] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [95, 1267.74] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [86, 2280.75] ++ - - [64, 2048, 1, 256, 96, 96, 
288, 288] ++ - [79, 3761.3] ++ - - [64, 2048, 1, 512, 96, 96, 544, 544] ++ - [95, 5872.85] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [69, 8268.72] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [62, 10099.5] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [69, 11490.8] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [32, 2119.67] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [2, 3783.77] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [33, 6294.81] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [47, 9922.23] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [21, 13753.9] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [50, 17211.9] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [21, 18627.1] ++ - - [128, 64, 1, 64, 160, 160, 128, 96] ++ - [4, 64.0156] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [27, 122.943] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [27, 212.37] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [79, 326.837] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [57, 482.27] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [63, 600.667] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [96, 681.212] ++ - - [128, 128, 1, 64, 160, 160, 128, 96] ++ - [25, 182.742] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [31, 334.154] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [96, 534.715] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [87, 821.929] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [63, 1099.14] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [87, 1311.03] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [63, 1468.5] ++ - - [128, 256, 1, 64, 160, 160, 128, 96] ++ - [31, 395.316] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [29, 697.076] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [87, 1106.39] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [56, 1663.09] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [96, 2274.88] ++ - - [128, 256, 1, 
2048, 160, 160, 2080, 2080] ++ - [87, 2670.79] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [96, 2961.36] ++ - - [128, 512, 1, 64, 160, 160, 128, 96] ++ - [57, 807.529] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [87, 1425.43] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [10, 2310.91] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [69, 3390.72] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [87, 4593.04] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [57, 5393.53] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [69, 5953.86] ++ - - [128, 1024, 1, 64, 160, 160, 128, 96] ++ - [86, 1467.31] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [80, 2616.13] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [57, 4378.2] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - [70, 6425.61] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [70, 8702.45] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [87, 10634.9] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [57, 11760.6] ++ - - [128, 2048, 1, 64, 160, 160, 128, 96] ++ - [56, 2577.15] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [73, 4849.64] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [69, 7956.94] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [70, 12220.5] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [96, 16674.0] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [96, 20479.6] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [95, 23351.4] ++ - - [128, 4096, 1, 64, 160, 160, 128, 96] ++ - [56, 4976.19] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [54, 9346.64] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [19, 14421.2] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [97, 22911.9] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [52, 30915.1] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [2, 36468.6] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [2, 38064.5] ++ - - [256, 64, 
1, 64, 288, 288, 256, 96] ++ - [2, 162.823] ++ - - [256, 64, 1, 128, 288, 288, 256, 160] ++ - [15, 268.419] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [56, 460.356] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [56, 746.185] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [56, 1014.71] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [96, 1275.69] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [69, 1422.28] ++ - - [256, 128, 1, 64, 288, 288, 256, 96] ++ - [43, 391.698] ++ - - [256, 128, 1, 128, 288, 288, 256, 160] ++ - [31, 694.306] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [56, 1155.94] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [56, 1719.68] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [69, 2227.02] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [79, 2702.74] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [95, 2980.24] ++ - - [256, 256, 1, 64, 288, 288, 256, 96] ++ - [96, 754.782] ++ - - [256, 256, 1, 128, 288, 288, 256, 160] ++ - [69, 1347.57] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [69, 2257.73] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [86, 3482.93] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [69, 4517.6] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [96, 5444.5] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [86, 6009.98] ++ - - [256, 512, 1, 64, 288, 288, 256, 96] ++ - [28, 1443.08] ++ - - [256, 512, 1, 128, 288, 288, 256, 160] ++ - [95, 2505.93] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [87, 4221.21] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [87, 6617.58] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [56, 8807.54] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [96, 10603.8] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [80, 11754.9] ++ - - [256, 1024, 1, 64, 288, 288, 256, 96] ++ - [11, 2571.61] ++ - - [256, 1024, 1, 128, 288, 288, 256, 160] ++ - [12, 4605.96] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] 
++ - [69, 7776.23] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [69, 12336.2] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [70, 16512.0] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [86, 20510.8] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [95, 23165.5] ++ - - [256, 2048, 1, 64, 288, 288, 256, 96] ++ - [95, 4740.67] ++ - - [256, 2048, 1, 128, 288, 288, 256, 160] ++ - [35, 8823.15] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [24, 14671.9] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [38, 22021.0] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [88, 29947.7] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [44, 36415.3] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [2, 38295.3] ++ - - [256, 4096, 1, 64, 288, 288, 256, 96] ++ - [86, 9478.65] ++ - - [256, 4096, 1, 128, 288, 288, 256, 160] ++ - [89, 15957.4] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [19, 22608.9] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [76, 29594.4] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [61, 35018.7] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [91, 37462.2] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [99, 40120.4] ++ - - [384, 64, 1, 64, 416, 416, 384, 96] ++ - [14, 251.017] ++ - - [384, 64, 1, 128, 416, 416, 384, 160] ++ - [26, 431.394] ++ - - [384, 64, 1, 256, 416, 416, 384, 288] ++ - [98, 720.836] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [56, 1145.88] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [56, 1550.19] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [56, 1936.95] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [56, 2150.38] ++ - - [384, 128, 1, 64, 416, 416, 384, 96] ++ - [87, 596.689] ++ - - [384, 128, 1, 128, 416, 416, 384, 160] ++ - [87, 1058.64] ++ - - [384, 128, 1, 256, 416, 416, 384, 288] ++ - [13, 1729.85] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [69, 2614.63] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - 
[87, 3348.52] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [62, 4071.32] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [69, 4475.92] ++ - - [384, 256, 1, 64, 416, 416, 384, 96] ++ - [57, 1101.83] ++ - - [384, 256, 1, 128, 416, 416, 384, 160] ++ - [70, 2065.83] ++ - - [384, 256, 1, 256, 416, 416, 384, 288] ++ - [56, 3419.27] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [70, 5109.31] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [86, 6658.08] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [87, 8087.04] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [87, 8924.45] ++ - - [384, 512, 1, 64, 416, 416, 384, 96] ++ - [70, 1954.78] ++ - - [384, 512, 1, 128, 416, 416, 384, 160] ++ - [63, 3559.52] ++ - - [384, 512, 1, 256, 416, 416, 384, 288] ++ - [63, 6024.14] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [57, 9550.6] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [70, 12682.8] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [80, 15623.1] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [79, 17438.4] ++ - - [384, 1024, 1, 64, 416, 416, 384, 96] ++ - [96, 4041.42] ++ - - [384, 1024, 1, 128, 416, 416, 384, 160] ++ - [42, 7122.09] ++ - - [384, 1024, 1, 256, 416, 416, 384, 288] ++ - [53, 11299.1] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [23, 17297.6] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [5, 22698.8] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [53, 27014.7] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [51, 28693.8] ++ - - [384, 2048, 1, 64, 416, 416, 384, 96] ++ - [87, 7737.38] ++ - - [384, 2048, 1, 128, 416, 416, 384, 160] ++ - [89, 13477.5] ++ - - [384, 2048, 1, 256, 416, 416, 384, 288] ++ - [52, 20140.8] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [76, 27226.6] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [75, 34287.3] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [75, 36698.3] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - 
[66, 39379.8] ++ - - [384, 4096, 1, 64, 416, 416, 384, 96] ++ - [94, 12294.0] ++ - - [384, 4096, 1, 128, 416, 416, 384, 160] ++ - [92, 19603.4] ++ - - [384, 4096, 1, 256, 416, 416, 384, 288] ++ - [67, 26502.6] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [3, 32352.0] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [24, 35150.1] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [5, 38319.7] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [0, 39902.5] ++ - - [768, 64, 1, 64, 800, 800, 768, 96] ++ - [3, 521.336] ++ - - [768, 64, 1, 128, 800, 800, 768, 160] ++ - [69, 911.937] ++ - - [768, 64, 1, 256, 800, 800, 768, 288] ++ - [95, 1548.48] ++ - - [768, 64, 1, 512, 800, 800, 768, 544] ++ - [69, 2366.99] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [95, 3198.91] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [86, 3854.62] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [95, 4325.24] ++ - - [768, 128, 1, 64, 800, 800, 768, 96] ++ - [56, 1136.87] ++ - - [768, 128, 1, 128, 800, 800, 768, 160] ++ - [70, 2036.41] ++ - - [768, 128, 1, 256, 800, 800, 768, 288] ++ - [17, 3333.67] ++ - - [768, 128, 1, 512, 800, 800, 768, 544] ++ - [56, 4946.6] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [87, 6696.17] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [87, 7940.01] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [70, 8809.26] ++ - - [768, 256, 1, 64, 800, 800, 768, 96] ++ - [57, 1955.69] ++ - - [768, 256, 1, 128, 800, 800, 768, 160] ++ - [56, 3699.22] ++ - - [768, 256, 1, 256, 800, 800, 768, 288] ++ - [40, 6082.41] ++ - - [768, 256, 1, 512, 800, 800, 768, 544] ++ - [62, 9286.28] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [87, 12858.6] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [62, 15355.6] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [62, 17321.8] ++ - - [768, 512, 1, 64, 800, 800, 768, 96] ++ - [69, 3629.35] ++ - - [768, 512, 1, 128, 800, 800, 768, 160] ++ - [41, 7139.24] ++ - - [768, 
512, 1, 256, 800, 800, 768, 288] ++ - [8, 10793.8] ++ - - [768, 512, 1, 512, 800, 800, 768, 544] ++ - [71, 16843.2] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [46, 22742.4] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [2, 26940.6] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [24, 28935.1] ++ - - [768, 1024, 1, 64, 800, 800, 768, 96] ++ - [86, 8121.94] ++ - - [768, 1024, 1, 128, 800, 800, 768, 160] ++ - [89, 13502.8] ++ - - [768, 1024, 1, 256, 800, 800, 768, 288] ++ - [91, 20195.3] ++ - - [768, 1024, 1, 512, 800, 800, 768, 544] ++ - [91, 27929.1] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [91, 33825.1] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [91, 36410.4] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [91, 39323.6] ++ - - [768, 2048, 1, 64, 800, 800, 768, 96] ++ - [84, 12286.5] ++ - - [768, 2048, 1, 128, 800, 800, 768, 160] ++ - [35, 19217.9] ++ - - [768, 2048, 1, 256, 800, 800, 768, 288] ++ - [50, 25969.3] ++ - - [768, 2048, 1, 512, 800, 800, 768, 544] ++ - [75, 33185.3] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [75, 35955.2] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [75, 39112.5] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [75, 41087.1] ++ - - [768, 4096, 1, 64, 800, 800, 768, 96] ++ - [90, 17225.1] ++ - - [768, 4096, 1, 128, 800, 800, 768, 160] ++ - [93, 24667.9] ++ - - [768, 4096, 1, 256, 800, 800, 768, 288] ++ - [93, 31551.0] ++ - - [768, 4096, 1, 512, 800, 800, 768, 544] ++ - [91, 35398.1] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [91, 38977.7] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [75, 41188.8] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [75, 41797.4] ++ - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] ++ - [3, 962.585] ++ - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] ++ - [95, 1767.76] ++ - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] ++ - [79, 3030.94] ++ - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] ++ - [56, 4549.96] ++ - 
- [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] ++ - [79, 6253.55] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [86, 7565.26] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [80, 8153.03] ++ - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] ++ - [30, 1928.43] ++ - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] ++ - [62, 3691.63] ++ - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] ++ - [62, 6226.1] ++ - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] ++ - [86, 9224.16] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] ++ - [69, 12645.3] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [96, 15299.0] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [70, 17380.5] ++ - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] ++ - [48, 3743.81] ++ - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] ++ - [34, 6733.33] ++ - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] ++ - [71, 11227.2] ++ - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] ++ - [44, 16777.3] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] ++ - [88, 22415.7] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [3, 27039.1] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [22, 28858.4] ++ - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] ++ - [95, 8089.34] ++ - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] ++ - [72, 13493.8] ++ - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] ++ - [82, 19548.2] ++ - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] ++ - [91, 28024.4] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] ++ - [75, 33843.5] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [91, 36464.0] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [60, 39425.1] ++ - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] ++ - [78, 12372.6] ++ - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] ++ - [36, 19315.7] ++ - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] ++ - [36, 26011.2] ++ - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] ++ - [75, 33351.6] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] ++ - 
[75, 36068.0] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [75, 39152.5] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [75, 41098.1] ++ - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] ++ - [78, 16872.9] ++ - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] ++ - [78, 24683.0] ++ - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] ++ - [93, 31600.6] ++ - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] ++ - [75, 35412.1] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] ++ - [75, 39036.7] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [75, 41217.2] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [75, 41741.7] ++ - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] ++ - [98, 21902.4] ++ - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] ++ - [36, 29299.9] ++ - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] ++ - [36, 33728.7] ++ - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] ++ - [75, 38410.1] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] ++ - [75, 40913.8] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [75, 41475.7] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [61, 41726.0] ++ - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] ++ - [48, 1550.39] ++ - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] ++ - [33, 3030.93] ++ - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] ++ - [26, 5128.05] ++ - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] ++ - [21, 7643.38] ++ - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] ++ - [50, 10644.3] ++ - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] ++ - [9, 12953.7] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [21, 14306.7] ++ - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] ++ - [55, 3807.81] ++ - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] ++ - [9, 6232.25] ++ - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] ++ - [52, 10402.4] ++ - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] ++ - [37, 17426.3] ++ - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] ++ - [6, 22940.6] ++ - - [3072, 128, 1, 2048, 
3104, 3104, 3072, 2080] ++ - [39, 27226.6] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [45, 28950.2] ++ - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] ++ - [86, 7738.57] ++ - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] ++ - [77, 13586.6] ++ - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] ++ - [35, 20181.1] ++ - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] ++ - [100, 27539.4] ++ - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] ++ - [76, 34329.7] ++ - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] ++ - [60, 36726.7] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [82, 39650.8] ++ - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] ++ - [77, 11863.7] ++ - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] ++ - [77, 18996.7] ++ - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] ++ - [50, 26614.7] ++ - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] ++ - [76, 32837.5] ++ - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] ++ - [75, 35822.5] ++ - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] ++ - [60, 39309.2] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [91, 40947.9] ++ - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] ++ - [74, 17251.6] ++ - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] ++ - [93, 25246.3] ++ - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] ++ - [20, 31507.8] ++ - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] ++ - [75, 35200.1] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] ++ - [75, 39209.7] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] ++ - [75, 41255.5] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [60, 41762.8] ++ - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] ++ - [81, 21704.1] ++ - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] ++ - [85, 29683.3] ++ - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] ++ - [82, 33793.9] ++ - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] ++ - [99, 38315.6] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] ++ - [75, 40925.8] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] ++ - [60, 
41684.5] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [75, 41418.4] ++ - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] ++ - [64, 6641.44] ++ - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] ++ - [65, 12983.6] ++ - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] ++ - [65, 25239.4] ++ - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] ++ - [63, 38579.9] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] ++ - [67, 40575.1] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] ++ - [75, 41624.3] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [76, 41686.9] ++ - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] ++ - [9, 2286.04] ++ - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] ++ - [7, 4202.71] ++ - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] ++ - [33, 7061.14] ++ - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] ++ - [47, 11003.3] ++ - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] ++ - [14, 14683.8] ++ - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] ++ - [3, 17541.4] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [2, 18721.0] ++ - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] ++ - [15, 4943.93] ++ - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] ++ - [49, 9607.64] ++ - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] ++ - [35, 14687.9] ++ - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] ++ - [97, 22601.3] ++ - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] ++ - [5, 30625.9] ++ - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] ++ - [3, 36577.9] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [2, 38162.6] ++ - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] ++ - [69, 9396.37] ++ - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] ++ - [72, 16176.7] ++ - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] ++ - [35, 22869.0] ++ - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] ++ - [66, 30585.8] ++ - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] ++ - [91, 35786.7] ++ - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] ++ - [75, 37887.2] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 
4128] ++ - [99, 40129.4] ++ - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] ++ - [52, 14114.8] ++ - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] ++ - [59, 21877.4] ++ - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] ++ - [83, 28425.4] ++ - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] ++ - [60, 35024.4] ++ - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] ++ - [91, 37284.7] ++ - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] ++ - [75, 40275.0] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [60, 41785.8] ++ - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] ++ - [35, 18693.3] ++ - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] ++ - [85, 26867.8] ++ - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] ++ - [35, 33306.7] ++ - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] ++ - [60, 36590.3] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] ++ - [60, 39935.4] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] ++ - [75, 41775.6] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [35, 41003.6] ++ - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] ++ - [90, 22677.7] ++ - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] ++ - [68, 30743.4] ++ - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] ++ - [50, 34783.9] ++ - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] ++ - [52, 38051.7] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] ++ - [56, 40550.6] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] ++ - [61, 41121.9] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [76, 42286.6] ++ - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] ++ - [58, 6883.54] ++ - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] ++ - [58, 13780.3] ++ - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] ++ - [65, 26062.2] ++ - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] ++ - [63, 39162.5] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] ++ - [3, 40101.9] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] ++ - [76, 41557.7] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [76, 41818.3] ++- 
null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_SB.yaml +new file mode 100644 +index 00000000..e1e8b1c1 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Ailk_Bljk_SB.yaml +@@ -0,0 +1,310 @@ ++- {MinimumRequiredVersion: 4.33.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 0 ++ DestDataType: 0 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ 
AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 8 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: false ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 1 ++ GlobalLoadVectorWidthB: 1 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 1 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 8 ++ LSPA: 8 ++ LSPB: 32 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 32 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 512 ++ LdsOffsetA: 0 ++ LdsOffsetB: 256 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 1 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 8 ++ LoopTail: true ++ LoopUnroll: 8 ++ MACInstruction: FMA ++ MIArchVgpr: false ++ MacroTile0: 
32 ++ MacroTile1: 32 ++ MacroTileA: 32 ++ MacroTileB: 32 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstruction: [] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 4 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: false ++ PrefetchLocalRead: true ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 0 ++ DestDataType: 0 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [0, 3, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 1 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: true ++ TLUB: false ++ Tensor0: 0 ++ 
Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: false ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_ ++ SourceSwap: false ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [2, 2] ++ ThreadTile0: 2 ++ ThreadTile1: 2 ++ ThreadTileA: 2 ++ ThreadTileB: 2 ++ TransposeLDS: 0 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: 0 ++ UnrollMajorLDSB: 0 ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 8 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [126, 126, 2, 66, 126, 126, 126, 66] ++ - [0, 0] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_BBS_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_BBS_BH.yaml +new file mode 100644 +index 00000000..3b94bbd5 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_BBS_BH.yaml +@@ -0,0 +1,21903 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- 
phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: 
false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 
1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: 
-1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 
0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 
++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ 
GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 
2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ 
SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: 
false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 
1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ 
LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] 
++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: 
null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 
++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ 
StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ 
AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ 
LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] 
++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: 
null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 
++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ 
StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 
2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ 
Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 
1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 
++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ 
MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ 
TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} 
++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ 
LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 
++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false 
++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 
++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ 
MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ 
TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} 
++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ 
LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 
++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 
++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 
128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false 
++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false 
++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [6, 31.3419] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [13, 53.7456] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [7, 93.4143] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [69, 146.041] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [64, 210.643] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [69, 274.093] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [69, 305.224] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [2, 56.1757] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [13, 103.758] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [13, 181.713] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [69, 299.529] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [64, 427.011] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [69, 542.794] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [77, 612.274] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [49, 134.519] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [6, 246.173] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [69, 422.09] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [69, 666.45] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [77, 932.586] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [73, 1148.38] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [64, 1273.73] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [46, 278.58] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [6, 510.504] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [20, 870.007] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [77, 1369.79] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [65, 1930.19] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [73, 2450.93] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [71, 2638.29] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [22, 579.724] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [77, 
1053.98] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [77, 1859.59] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [69, 2902.88] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [69, 4025.49] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [69, 4916.23] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [76, 5422.94] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [46, 1229.65] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [21, 2227.16] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [20, 3710.96] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [68, 5620.51] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [64, 7861.41] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [66, 9902.09] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [73, 9588.87] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [58, 2126.93] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [21, 3867.96] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [46, 6570.3] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [5, 10059.1] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [56, 14007.3] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [21, 17248.3] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [77, 14709.2] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [10, 66.475] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [13, 116.405] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [6, 186.812] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [6, 292.246] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [64, 418.656] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [66, 539.218] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [66, 607.452] ++ - - [128, 128, 1, 64, 160, 160, 96, 128] ++ - [3, 139.847] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [46, 265.867] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [46, 450.662] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [7, 688.155] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - 
[69, 958.698] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [69, 1167.31] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [66, 1261.47] ++ - - [128, 256, 1, 64, 160, 160, 96, 256] ++ - [8, 325.746] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [46, 588.426] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [13, 1019.4] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [46, 1515.01] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [13, 1995.98] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [64, 2391.45] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [74, 2590.03] ++ - - [128, 512, 1, 64, 160, 160, 96, 512] ++ - [22, 728.81] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [22, 1304.4] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [22, 2133.42] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [74, 3046.25] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [74, 4068.44] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [71, 4851.89] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [76, 5304.0] ++ - - [128, 1024, 1, 64, 160, 160, 96, 1024] ++ - [22, 1411.51] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [13, 2529.74] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [22, 4143.03] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [77, 6194.28] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [77, 8205.52] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [71, 9977.9] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [77, 10612.6] ++ - - [128, 2048, 1, 64, 160, 160, 96, 2048] ++ - [13, 2663.9] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [58, 4954.15] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [13, 8181.04] ++ - - [128, 2048, 1, 512, 160, 160, 544, 2048] ++ - [69, 12073.2] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [70, 16273.8] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [66, 19675.0] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - 
[63, 21060.4] ++ - - [128, 4096, 1, 64, 160, 160, 96, 4096] ++ - [58, 4735.98] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [6, 8661.44] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [55, 14341.1] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [6, 21721.6] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [33, 28980.9] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [45, 35093.1] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [45, 33039.3] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [2, 142.141] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [21, 250.736] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [6, 408.125] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [66, 643.002] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [66, 917.088] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [66, 1139.83] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [64, 1242.11] ++ - - [256, 128, 1, 64, 288, 288, 96, 128] ++ - [46, 308.77] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [22, 587.191] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [46, 1020.52] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [22, 1517.48] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [46, 2000.74] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [74, 2370.92] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [66, 2572.11] ++ - - [256, 256, 1, 64, 288, 288, 96, 256] ++ - [58, 725.784] ++ - - [256, 256, 1, 128, 288, 288, 160, 256] ++ - [22, 1304.6] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [13, 2125.58] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [69, 3035.5] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [69, 4017.29] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [63, 4810.84] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [64, 5195.3] ++ - - [256, 512, 1, 64, 288, 288, 96, 512] ++ - [46, 1486.55] ++ - - [256, 512, 1, 128, 288, 288, 160, 512] ++ - [44, 2648.78] ++ - - [256, 
512, 1, 256, 288, 288, 288, 512] ++ - [46, 4331.84] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [66, 6314.94] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [74, 8129.97] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [64, 9840.73] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [63, 10545.1] ++ - - [256, 1024, 1, 64, 288, 288, 96, 1024] ++ - [58, 2796.69] ++ - - [256, 1024, 1, 128, 288, 288, 160, 1024] ++ - [46, 4976.93] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [7, 8191.0] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [13, 12053.7] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [74, 16096.1] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [76, 19227.5] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [66, 20928.2] ++ - - [256, 2048, 1, 64, 288, 288, 96, 2048] ++ - [59, 4436.07] ++ - - [256, 2048, 1, 128, 288, 288, 160, 2048] ++ - [20, 8234.22] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [12, 14061.6] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [12, 21365.4] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [11, 27006.9] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [12, 34570.0] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [31, 37368.4] ++ - - [256, 4096, 1, 64, 288, 288, 96, 4096] ++ - [32, 7932.49] ++ - - [256, 4096, 1, 128, 288, 288, 160, 4096] ++ - [38, 14082.2] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [16, 21178.4] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [18, 27895.2] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [38, 33316.0] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [3, 35361.2] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [35, 36518.4] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [42, 228.084] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [6, 402.27] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [46, 650.82] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [69, 1008.89] ++ - - [384, 
64, 1, 1024, 416, 416, 1056, 1056] ++ - [22, 1379.02] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [69, 1707.55] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [66, 1885.58] ++ - - [384, 128, 1, 64, 416, 416, 96, 128] ++ - [34, 496.798] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [46, 947.227] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [7, 1500.65] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [69, 2297.2] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [66, 3037.33] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [22, 3580.67] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [63, 3860.61] ++ - - [384, 256, 1, 64, 416, 416, 96, 256] ++ - [22, 1047.01] ++ - - [384, 256, 1, 128, 416, 416, 160, 256] ++ - [22, 1884.52] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [34, 3214.44] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [46, 4711.82] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [63, 6166.97] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [66, 7336.17] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [66, 7785.41] ++ - - [384, 512, 1, 64, 416, 416, 96, 512] ++ - [46, 2042.35] ++ - - [384, 512, 1, 128, 416, 416, 160, 512] ++ - [7, 3652.51] ++ - - [384, 512, 1, 256, 416, 416, 288, 512] ++ - [22, 6018.37] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [69, 9206.45] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [64, 12175.1] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [74, 14334.9] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [66, 15369.6] ++ - - [384, 1024, 1, 64, 416, 416, 96, 1024] ++ - [45, 3663.68] ++ - - [384, 1024, 1, 128, 416, 416, 160, 1024] ++ - [22, 6545.93] ++ - - [384, 1024, 1, 256, 416, 416, 288, 1024] ++ - [57, 10981.1] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [20, 16102.3] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [22, 21679.5] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [57, 26211.0] ++ - - [384, 1024, 1, 
4096, 416, 416, 4128, 4128] ++ - [22, 28380.4] ++ - - [384, 2048, 1, 64, 416, 416, 96, 2048] ++ - [49, 6213.02] ++ - - [384, 2048, 1, 128, 416, 416, 160, 2048] ++ - [39, 10702.0] ++ - - [384, 2048, 1, 256, 416, 416, 288, 2048] ++ - [39, 18051.4] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [49, 24073.5] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [62, 30606.0] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [26, 34036.7] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [48, 37099.4] ++ - - [384, 4096, 1, 64, 416, 416, 96, 4096] ++ - [51, 10422.8] ++ - - [384, 4096, 1, 128, 416, 416, 160, 4096] ++ - [42, 16622.1] ++ - - [384, 4096, 1, 256, 416, 416, 288, 4096] ++ - [42, 23483.8] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] ++ - [51, 30116.2] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [2, 33413.8] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [2, 36691.6] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [16, 37313.6] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [39, 454.782] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [6, 810.128] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [46, 1319.24] ++ - - [768, 64, 1, 512, 800, 800, 544, 544] ++ - [66, 2038.54] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [71, 2794.96] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [77, 3451.87] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [66, 3760.23] ++ - - [768, 128, 1, 64, 800, 800, 96, 128] ++ - [22, 1034.44] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [46, 1958.73] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [58, 3195.66] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [46, 4692.49] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [77, 6136.14] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [63, 7270.21] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [64, 7702.75] ++ - - [768, 256, 1, 64, 800, 800, 96, 256] ++ - [56, 2101.71] ++ - - [768, 256, 1, 128, 800, 
800, 160, 256] ++ - [46, 3774.12] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [13, 6218.43] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [74, 9186.28] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [69, 12130.3] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [71, 14302.3] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [77, 15585.0] ++ - - [768, 512, 1, 64, 800, 800, 96, 512] ++ - [38, 3626.2] ++ - - [768, 512, 1, 128, 800, 800, 160, 512] ++ - [31, 6595.68] ++ - - [768, 512, 1, 256, 800, 800, 288, 512] ++ - [19, 10682.7] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [20, 15968.2] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [21, 21041.7] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [58, 26153.1] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [33, 28365.9] ++ - - [768, 1024, 1, 64, 800, 800, 96, 1024] ++ - [51, 5971.27] ++ - - [768, 1024, 1, 128, 800, 800, 160, 1024] ++ - [39, 10776.5] ++ - - [768, 1024, 1, 256, 800, 800, 288, 1024] ++ - [54, 18203.2] ++ - - [768, 1024, 1, 512, 800, 800, 544, 1024] ++ - [60, 24603.0] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [62, 31483.1] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [59, 34001.4] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [59, 37627.6] ++ - - [768, 2048, 1, 64, 800, 800, 96, 2048] ++ - [15, 10474.8] ++ - - [768, 2048, 1, 128, 800, 800, 160, 2048] ++ - [30, 16803.8] ++ - - [768, 2048, 1, 256, 800, 800, 288, 2048] ++ - [27, 24098.0] ++ - - [768, 2048, 1, 512, 800, 800, 544, 2048] ++ - [39, 31332.4] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [18, 34262.5] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [3, 37580.7] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [18, 39058.2] ++ - - [768, 4096, 1, 64, 800, 800, 96, 4096] ++ - [47, 13920.1] ++ - - [768, 4096, 1, 128, 800, 800, 160, 4096] ++ - [16, 21061.5] ++ - - [768, 4096, 1, 256, 800, 800, 288, 4096] ++ - [51, 28053.6] ++ - - [768, 4096, 1, 512, 
800, 800, 544, 4096] ++ - [40, 32705.5] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [16, 37264.6] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [18, 39923.5] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [41, 40574.3] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [46, 963.47] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [22, 1623.18] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [58, 2724.16] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [5, 4143.21] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [69, 5591.48] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [69, 6922.48] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [64, 7706.44] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 128] ++ - [42, 1887.63] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [22, 3597.17] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [22, 5963.48] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [46, 8840.97] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [64, 11817.7] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [74, 14597.9] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [64, 15648.9] ++ - - [1536, 256, 1, 64, 1568, 1568, 96, 256] ++ - [55, 3489.92] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 256] ++ - [45, 6316.72] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [33, 10590.6] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [45, 16378.7] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [46, 21949.0] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [45, 26374.1] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [43, 28173.3] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 512] ++ - [24, 6168.85] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 512] ++ - [24, 10743.1] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 512] ++ - [42, 17652.5] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [28, 24610.6] ++ - - [1536, 512, 1, 1024, 1568, 1568, 
1056, 1056] ++ - [24, 31603.0] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [24, 34763.2] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [47, 37426.5] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] ++ - [38, 10887.3] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] ++ - [39, 16999.6] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] ++ - [41, 24244.5] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] ++ - [39, 31010.3] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [39, 34354.6] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [18, 37523.8] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [18, 39432.3] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] ++ - [47, 13686.4] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] ++ - [42, 20792.8] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] ++ - [54, 28334.9] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] ++ - [16, 32858.9] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] ++ - [54, 37154.2] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [18, 39958.7] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [18, 40498.2] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] ++ - [54, 16087.5] ++ - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] ++ - [39, 23586.3] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] ++ - [39, 29628.1] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] ++ - [39, 35542.2] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] ++ - [54, 39207.7] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [16, 40647.0] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [17, 41058.2] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [19, 1678.62] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [55, 2911.7] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [57, 5019.12] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [19, 7567.53] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [6, 10735.7] ++ - - 
[3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [4, 12655.7] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [75, 10298.3] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 128] ++ - [55, 3220.61] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [19, 6127.56] ++ - - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [33, 10076.4] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [19, 14908.7] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [20, 21530.0] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [44, 26064.2] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [71, 21717.2] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 256] ++ - [24, 5793.24] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 256] ++ - [60, 11558.5] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [49, 19328.6] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [26, 26360.3] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [47, 31997.2] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [23, 34932.9] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [9, 36577.8] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 512] ++ - [38, 9776.01] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 512] ++ - [27, 17366.3] ++ - - [3072, 512, 1, 256, 3104, 3104, 288, 512] ++ - [39, 24780.2] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [51, 31003.2] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [18, 34266.9] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [16, 37733.5] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [2, 38526.8] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] ++ - [39, 13986.8] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] ++ - [42, 21180.0] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] ++ - [51, 28100.6] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] ++ - [18, 33081.0] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [18, 37274.5] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [16, 
39924.7] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [0, 39642.0] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] ++ - [39, 16096.5] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] ++ - [39, 23578.0] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] ++ - [16, 29621.6] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] ++ - [42, 35377.9] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] ++ - [18, 39180.5] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [18, 40769.0] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [52, 40033.8] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] ++ - [39, 16953.1] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] ++ - [51, 24535.6] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] ++ - [54, 31692.5] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] ++ - [39, 36868.0] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] ++ - [51, 39602.3] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] ++ - [53, 40759.8] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [59, 38496.4] ++ - - [4096, 64, 1, 64, 4128, 4128, 96, 96] ++ - [38, 1810.23] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [12, 3632.24] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [1, 5986.52] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [50, 9024.89] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [18, 12521.5] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [1, 15810.3] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [67, 14132.5] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 128] ++ - [39, 5069.41] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [61, 8922.89] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [30, 14676.7] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [16, 21728.7] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [39, 29012.2] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [2, 34178.2] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 
4128] ++ - [72, 28705.1] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 256] ++ - [36, 8789.66] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 256] ++ - [50, 14625.4] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [37, 21662.0] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [29, 28523.6] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [2, 33504.2] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [18, 35874.5] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [14, 36589.4] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 512] ++ - [25, 11798.3] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 512] ++ - [16, 18593.6] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 512] ++ - [39, 25839.7] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [16, 32826.1] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [39, 35634.0] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [18, 38853.7] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [61, 38321.1] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] ++ - [42, 14780.9] ++ - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] ++ - [16, 22119.9] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] ++ - [39, 28047.5] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] ++ - [42, 34050.3] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [16, 38228.2] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [18, 40709.9] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [42, 39644.9] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] ++ - [42, 16864.2] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] ++ - [39, 23391.0] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] ++ - [51, 30651.7] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] ++ - [39, 36170.6] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] ++ - [16, 39707.2] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [18, 40750.7] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [38, 40255.2] ++ - - [4096, 4096, 
1, 64, 4128, 4128, 96, 4096] ++ - [46, 17374.8] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] ++ - [42, 24761.4] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] ++ - [39, 31607.4] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] ++ - [39, 37076.6] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] ++ - [51, 39402.8] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] ++ - [18, 41023.1] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [59, 37774.0] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_BBS_BH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_BBS_BH_GB.yaml +new file mode 100644 +index 00000000..2cd8a55e +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_BBS_BH_GB.yaml +@@ -0,0 +1,21903 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: 
false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ 
LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: 
{0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: 
false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ 
IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ 
_VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 
++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 
++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: 
{0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 
++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 
++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false 
++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: 
{0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ 
LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 
++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 
++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: 
{0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: 
false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ 
IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false 
++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: 
{0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: 
false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ 
IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ 
_VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: 
{0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ 
LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 
++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 
++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 
1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 
++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 
1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 
0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 
++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 
++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false 
++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: 
false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 
false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 
++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 
++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ 
StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ 
AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ 
LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] 
++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ 
ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: 
Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ 
DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly 
++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [6, 31.3419] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [13, 53.7456] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [7, 93.4143] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [69, 146.041] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [64, 210.643] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [69, 274.093] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [69, 305.224] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [2, 56.1757] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [13, 103.758] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [13, 181.713] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [69, 299.529] ++ - - [64, 128, 1, 1024, 96, 96, 
1056, 1056] ++ - [64, 427.011] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [69, 542.794] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [77, 612.274] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [49, 134.519] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [6, 246.173] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [69, 422.09] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [69, 666.45] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [77, 932.586] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [73, 1148.38] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [64, 1273.73] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [46, 278.58] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [6, 510.504] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [20, 870.007] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [77, 1369.79] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [65, 1930.19] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [73, 2450.93] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [71, 2638.29] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [22, 579.724] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [77, 1053.98] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [77, 1859.59] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [69, 2902.88] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [69, 4025.49] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [69, 4916.23] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [76, 5422.94] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [46, 1229.65] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [21, 2227.16] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [20, 3710.96] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [68, 5620.51] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [64, 7861.41] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [66, 9902.09] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [73, 9588.87] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [58, 
2126.93] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [21, 3867.96] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [46, 6570.3] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [5, 10059.1] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [56, 14007.3] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [21, 17248.3] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [77, 14709.2] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [10, 66.475] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [13, 116.405] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [6, 186.812] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [6, 292.246] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [64, 418.656] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [66, 539.218] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [66, 607.452] ++ - - [128, 128, 1, 64, 160, 160, 96, 128] ++ - [3, 139.847] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [46, 265.867] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [46, 450.662] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [7, 688.155] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [69, 958.698] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [69, 1167.31] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [66, 1261.47] ++ - - [128, 256, 1, 64, 160, 160, 96, 256] ++ - [8, 325.746] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [46, 588.426] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [13, 1019.4] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [46, 1515.01] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [13, 1995.98] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [64, 2391.45] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [74, 2590.03] ++ - - [128, 512, 1, 64, 160, 160, 96, 512] ++ - [22, 728.81] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [22, 1304.4] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [22, 2133.42] ++ - - [128, 512, 1, 512, 160, 160, 544, 
544] ++ - [74, 3046.25] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [74, 4068.44] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [71, 4851.89] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [76, 5304.0] ++ - - [128, 1024, 1, 64, 160, 160, 96, 1024] ++ - [22, 1411.51] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [13, 2529.74] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [22, 4143.03] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [77, 6194.28] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [77, 8205.52] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [71, 9977.9] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [77, 10612.6] ++ - - [128, 2048, 1, 64, 160, 160, 96, 2048] ++ - [13, 2663.9] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [58, 4954.15] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [13, 8181.04] ++ - - [128, 2048, 1, 512, 160, 160, 544, 2048] ++ - [69, 12073.2] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [70, 16273.8] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [66, 19675.0] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [63, 21060.4] ++ - - [128, 4096, 1, 64, 160, 160, 96, 4096] ++ - [58, 4735.98] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [6, 8661.44] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [55, 14341.1] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [6, 21721.6] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [33, 28980.9] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [45, 35093.1] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [45, 33039.3] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [2, 142.141] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [21, 250.736] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [6, 408.125] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [66, 643.002] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [66, 917.088] ++ - - [256, 64, 1, 2048, 288, 288, 
2080, 2080] ++ - [66, 1139.83] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [64, 1242.11] ++ - - [256, 128, 1, 64, 288, 288, 96, 128] ++ - [46, 308.77] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [22, 587.191] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [46, 1020.52] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [22, 1517.48] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [46, 2000.74] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [74, 2370.92] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [66, 2572.11] ++ - - [256, 256, 1, 64, 288, 288, 96, 256] ++ - [58, 725.784] ++ - - [256, 256, 1, 128, 288, 288, 160, 256] ++ - [22, 1304.6] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [13, 2125.58] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [69, 3035.5] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [69, 4017.29] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [63, 4810.84] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [64, 5195.3] ++ - - [256, 512, 1, 64, 288, 288, 96, 512] ++ - [46, 1486.55] ++ - - [256, 512, 1, 128, 288, 288, 160, 512] ++ - [44, 2648.78] ++ - - [256, 512, 1, 256, 288, 288, 288, 512] ++ - [46, 4331.84] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [66, 6314.94] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [74, 8129.97] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [64, 9840.73] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [63, 10545.1] ++ - - [256, 1024, 1, 64, 288, 288, 96, 1024] ++ - [58, 2796.69] ++ - - [256, 1024, 1, 128, 288, 288, 160, 1024] ++ - [46, 4976.93] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [7, 8191.0] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [13, 12053.7] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [74, 16096.1] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [76, 19227.5] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [66, 20928.2] ++ - - [256, 2048, 1, 64, 288, 288, 96, 2048] ++ 
- [59, 4436.07] ++ - - [256, 2048, 1, 128, 288, 288, 160, 2048] ++ - [20, 8234.22] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [12, 14061.6] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [12, 21365.4] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [11, 27006.9] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [12, 34570.0] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [31, 37368.4] ++ - - [256, 4096, 1, 64, 288, 288, 96, 4096] ++ - [32, 7932.49] ++ - - [256, 4096, 1, 128, 288, 288, 160, 4096] ++ - [38, 14082.2] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [16, 21178.4] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [18, 27895.2] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [38, 33316.0] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [3, 35361.2] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [35, 36518.4] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [42, 228.084] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [6, 402.27] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [46, 650.82] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [69, 1008.89] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [22, 1379.02] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [69, 1707.55] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [66, 1885.58] ++ - - [384, 128, 1, 64, 416, 416, 96, 128] ++ - [34, 496.798] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [46, 947.227] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [7, 1500.65] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [69, 2297.2] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [66, 3037.33] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [22, 3580.67] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [63, 3860.61] ++ - - [384, 256, 1, 64, 416, 416, 96, 256] ++ - [22, 1047.01] ++ - - [384, 256, 1, 128, 416, 416, 160, 256] ++ - [22, 1884.52] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [34, 3214.44] 
++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [46, 4711.82] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [63, 6166.97] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [66, 7336.17] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [66, 7785.41] ++ - - [384, 512, 1, 64, 416, 416, 96, 512] ++ - [46, 2042.35] ++ - - [384, 512, 1, 128, 416, 416, 160, 512] ++ - [7, 3652.51] ++ - - [384, 512, 1, 256, 416, 416, 288, 512] ++ - [22, 6018.37] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [69, 9206.45] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [64, 12175.1] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [74, 14334.9] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [66, 15369.6] ++ - - [384, 1024, 1, 64, 416, 416, 96, 1024] ++ - [45, 3663.68] ++ - - [384, 1024, 1, 128, 416, 416, 160, 1024] ++ - [22, 6545.93] ++ - - [384, 1024, 1, 256, 416, 416, 288, 1024] ++ - [57, 10981.1] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [20, 16102.3] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [22, 21679.5] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [57, 26211.0] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [22, 28380.4] ++ - - [384, 2048, 1, 64, 416, 416, 96, 2048] ++ - [49, 6213.02] ++ - - [384, 2048, 1, 128, 416, 416, 160, 2048] ++ - [39, 10702.0] ++ - - [384, 2048, 1, 256, 416, 416, 288, 2048] ++ - [39, 18051.4] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [49, 24073.5] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [62, 30606.0] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [26, 34036.7] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [48, 37099.4] ++ - - [384, 4096, 1, 64, 416, 416, 96, 4096] ++ - [51, 10422.8] ++ - - [384, 4096, 1, 128, 416, 416, 160, 4096] ++ - [42, 16622.1] ++ - - [384, 4096, 1, 256, 416, 416, 288, 4096] ++ - [42, 23483.8] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] ++ - [51, 30116.2] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - 
[2, 33413.8] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [2, 36691.6] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [16, 37313.6] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [39, 454.782] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [6, 810.128] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [46, 1319.24] ++ - - [768, 64, 1, 512, 800, 800, 544, 544] ++ - [66, 2038.54] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [71, 2794.96] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [77, 3451.87] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [66, 3760.23] ++ - - [768, 128, 1, 64, 800, 800, 96, 128] ++ - [22, 1034.44] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [46, 1958.73] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [58, 3195.66] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [46, 4692.49] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [77, 6136.14] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [63, 7270.21] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [64, 7702.75] ++ - - [768, 256, 1, 64, 800, 800, 96, 256] ++ - [56, 2101.71] ++ - - [768, 256, 1, 128, 800, 800, 160, 256] ++ - [46, 3774.12] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [13, 6218.43] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [74, 9186.28] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [69, 12130.3] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [71, 14302.3] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [77, 15585.0] ++ - - [768, 512, 1, 64, 800, 800, 96, 512] ++ - [38, 3626.2] ++ - - [768, 512, 1, 128, 800, 800, 160, 512] ++ - [31, 6595.68] ++ - - [768, 512, 1, 256, 800, 800, 288, 512] ++ - [19, 10682.7] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [20, 15968.2] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [21, 21041.7] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [58, 26153.1] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [33, 28365.9] ++ - - 
[768, 1024, 1, 64, 800, 800, 96, 1024] ++ - [51, 5971.27] ++ - - [768, 1024, 1, 128, 800, 800, 160, 1024] ++ - [39, 10776.5] ++ - - [768, 1024, 1, 256, 800, 800, 288, 1024] ++ - [54, 18203.2] ++ - - [768, 1024, 1, 512, 800, 800, 544, 1024] ++ - [60, 24603.0] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [62, 31483.1] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [59, 34001.4] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [59, 37627.6] ++ - - [768, 2048, 1, 64, 800, 800, 96, 2048] ++ - [15, 10474.8] ++ - - [768, 2048, 1, 128, 800, 800, 160, 2048] ++ - [30, 16803.8] ++ - - [768, 2048, 1, 256, 800, 800, 288, 2048] ++ - [27, 24098.0] ++ - - [768, 2048, 1, 512, 800, 800, 544, 2048] ++ - [39, 31332.4] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [18, 34262.5] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [3, 37580.7] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [18, 39058.2] ++ - - [768, 4096, 1, 64, 800, 800, 96, 4096] ++ - [47, 13920.1] ++ - - [768, 4096, 1, 128, 800, 800, 160, 4096] ++ - [16, 21061.5] ++ - - [768, 4096, 1, 256, 800, 800, 288, 4096] ++ - [51, 28053.6] ++ - - [768, 4096, 1, 512, 800, 800, 544, 4096] ++ - [40, 32705.5] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [16, 37264.6] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [18, 39923.5] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [41, 40574.3] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [46, 963.47] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [22, 1623.18] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [58, 2724.16] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [5, 4143.21] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [69, 5591.48] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [69, 6922.48] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [64, 7706.44] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 128] ++ - [42, 1887.63] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] 
++ - [22, 3597.17] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [22, 5963.48] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [46, 8840.97] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [64, 11817.7] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [74, 14597.9] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [64, 15648.9] ++ - - [1536, 256, 1, 64, 1568, 1568, 96, 256] ++ - [55, 3489.92] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 256] ++ - [45, 6316.72] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [33, 10590.6] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [45, 16378.7] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [46, 21949.0] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [45, 26374.1] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [43, 28173.3] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 512] ++ - [24, 6168.85] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 512] ++ - [24, 10743.1] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 512] ++ - [42, 17652.5] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [28, 24610.6] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [24, 31603.0] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [24, 34763.2] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [47, 37426.5] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] ++ - [38, 10887.3] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] ++ - [39, 16999.6] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] ++ - [41, 24244.5] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] ++ - [39, 31010.3] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [39, 34354.6] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [18, 37523.8] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [18, 39432.3] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] ++ - [47, 13686.4] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] ++ - [42, 20792.8] ++ - - [1536, 2048, 1, 256, 1568, 
1568, 288, 2048] ++ - [54, 28334.9] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] ++ - [16, 32858.9] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] ++ - [54, 37154.2] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [18, 39958.7] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [18, 40498.2] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] ++ - [54, 16087.5] ++ - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] ++ - [39, 23586.3] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] ++ - [39, 29628.1] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] ++ - [39, 35542.2] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] ++ - [54, 39207.7] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [16, 40647.0] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [17, 41058.2] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [19, 1678.62] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [55, 2911.7] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [57, 5019.12] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [19, 7567.53] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [6, 10735.7] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [4, 12655.7] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [75, 10298.3] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 128] ++ - [55, 3220.61] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [19, 6127.56] ++ - - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [33, 10076.4] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [19, 14908.7] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [20, 21530.0] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [44, 26064.2] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [71, 21717.2] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 256] ++ - [24, 5793.24] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 256] ++ - [60, 11558.5] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [49, 19328.6] ++ - - [3072, 256, 1, 512, 
3104, 3104, 544, 544] ++ - [26, 26360.3] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [47, 31997.2] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [23, 34932.9] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [9, 36577.8] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 512] ++ - [38, 9776.01] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 512] ++ - [27, 17366.3] ++ - - [3072, 512, 1, 256, 3104, 3104, 288, 512] ++ - [39, 24780.2] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [51, 31003.2] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [18, 34266.9] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [16, 37733.5] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [2, 38526.8] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] ++ - [39, 13986.8] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] ++ - [42, 21180.0] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] ++ - [51, 28100.6] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] ++ - [18, 33081.0] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [18, 37274.5] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [16, 39924.7] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [0, 39642.0] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] ++ - [39, 16096.5] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] ++ - [39, 23578.0] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] ++ - [16, 29621.6] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] ++ - [42, 35377.9] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] ++ - [18, 39180.5] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [18, 40769.0] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [52, 40033.8] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] ++ - [39, 16953.1] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] ++ - [51, 24535.6] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] ++ - [54, 31692.5] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] ++ - [39, 36868.0] 
++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] ++ - [51, 39602.3] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] ++ - [53, 40759.8] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [59, 38496.4] ++ - - [4096, 64, 1, 64, 4128, 4128, 96, 96] ++ - [38, 1810.23] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [12, 3632.24] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [1, 5986.52] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [50, 9024.89] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [18, 12521.5] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [1, 15810.3] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [67, 14132.5] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 128] ++ - [39, 5069.41] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [61, 8922.89] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [30, 14676.7] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [16, 21728.7] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [39, 29012.2] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [2, 34178.2] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [72, 28705.1] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 256] ++ - [36, 8789.66] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 256] ++ - [50, 14625.4] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [37, 21662.0] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [29, 28523.6] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [2, 33504.2] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [18, 35874.5] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [14, 36589.4] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 512] ++ - [25, 11798.3] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 512] ++ - [16, 18593.6] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 512] ++ - [39, 25839.7] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [16, 32826.1] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [39, 35634.0] ++ - 
- [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [18, 38853.7] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [61, 38321.1] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] ++ - [42, 14780.9] ++ - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] ++ - [16, 22119.9] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] ++ - [39, 28047.5] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] ++ - [42, 34050.3] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [16, 38228.2] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [18, 40709.9] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [42, 39644.9] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] ++ - [42, 16864.2] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] ++ - [39, 23391.0] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] ++ - [51, 30651.7] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] ++ - [39, 36170.6] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] ++ - [16, 39707.2] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [18, 40750.7] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [38, 40255.2] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] ++ - [46, 17374.8] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] ++ - [42, 24761.4] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] ++ - [39, 31607.4] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] ++ - [39, 37076.6] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] ++ - [51, 39402.8] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] ++ - [18, 41023.1] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [59, 37774.0] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HB.yaml +new file mode 100644 +index 00000000..17ead47c +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HB.yaml +@@ -0,0 
+1,16503 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ 
DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ 
MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ 
AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ 
LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false 
++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ 
WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ 
LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: 
Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ 
DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: 
[] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: 
[] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' 
++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: 
[] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 
32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly 
++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' 
++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: 
[] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: 
[] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: 
[] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: 
[] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 
32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: 
Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ 
LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ 
ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 
4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: 
[11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ 
PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 
1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ 
LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ 
ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 
4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ 
ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] 
++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true 
++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ 
LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ 
ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 
2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ 
ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] 
++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 
++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: 
true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 
++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ 
ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ 
CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ 
MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ 
NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ 
LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ 
ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ 
WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: 
[11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ 
PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true 
++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ 
LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ 
ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 
2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [2, 36.6584] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [15, 66.2482] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [10, 105.216] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [47, 166.441] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [47, 231.129] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [47, 288.994] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [53, 320.819] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [12, 64.6632] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [28, 118.644] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [53, 203.726] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [47, 323.485] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [47, 461.979] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [47, 577.549] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [47, 646.098] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [41, 147.065] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [28, 268.041] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [20, 452.655] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [53, 708.497] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [47, 981.124] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [47, 1204.74] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [53, 1312.13] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [19, 317.318] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [47, 574.326] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [47, 966.875] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [53, 1487.87] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [53, 2018.92] ++ - - [64, 512, 1, 
2048, 96, 96, 2080, 2080] ++ - [47, 2505.66] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [52, 2672.86] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [40, 655.975] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [55, 1184.0] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [55, 1997.77] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [53, 3070.23] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [53, 4166.19] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [46, 5088.44] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [49, 5478.84] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [38, 1391.61] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [17, 2498.1] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [55, 4133.34] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [53, 6256.67] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [46, 8287.1] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [53, 10379.9] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [54, 10796.2] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [7, 2505.18] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [17, 4460.26] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [38, 7472.32] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [27, 11140.2] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [27, 15017.4] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [9, 18041.2] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [53, 14841.4] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [28, 73.44] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [15, 134.969] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [10, 204.58] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [28, 320.031] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [47, 449.913] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [47, 565.651] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [47, 627.139] ++ - - [128, 128, 1, 64, 160, 160, 96, 128] ++ - [45, 173.605] ++ - - [128, 128, 1, 
128, 160, 160, 160, 160] ++ - [10, 323.535] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [10, 531.867] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [10, 782.228] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [55, 1023.06] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [47, 1219.1] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [53, 1300.54] ++ - - [128, 256, 1, 64, 160, 160, 96, 256] ++ - [10, 367.728] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [10, 654.133] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [8, 1063.87] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [53, 1571.05] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [47, 2075.75] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [50, 2502.85] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [50, 2630.79] ++ - - [128, 512, 1, 64, 160, 160, 96, 512] ++ - [28, 799.524] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [28, 1398.11] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [41, 2267.51] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [47, 3304.57] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [53, 4283.19] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [55, 4984.32] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [55, 5312.82] ++ - - [128, 1024, 1, 64, 160, 160, 96, 1024] ++ - [41, 1612.26] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [28, 2817.33] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [28, 4377.62] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [47, 6467.09] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [55, 8650.85] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [50, 10209.4] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [55, 10739.8] ++ - - [128, 2048, 1, 64, 160, 160, 96, 2048] ++ - [10, 2976.79] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [26, 5254.37] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [28, 8608.14] ++ - - [128, 2048, 1, 512, 
160, 160, 544, 2048] ++ - [55, 12586.1] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [50, 16523.2] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [53, 20043.7] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [54, 20504.6] ++ - - [128, 4096, 1, 64, 160, 160, 96, 4096] ++ - [20, 4916.4] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [41, 9095.84] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [28, 15075.6] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [41, 22510.4] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [28, 30695.9] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [9, 36539.3] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [28, 32997.1] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [20, 170.5] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [9, 311.427] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [10, 493.623] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [47, 717.895] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [47, 977.867] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [54, 1172.13] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [55, 1279.22] ++ - - [256, 128, 1, 64, 288, 288, 96, 128] ++ - [28, 379.988] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [10, 684.229] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [10, 1111.66] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [10, 1621.93] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [53, 2105.84] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [53, 2496.43] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [50, 2621.85] ++ - - [256, 256, 1, 64, 288, 288, 96, 256] ++ - [32, 754.643] ++ - - [256, 256, 1, 128, 288, 288, 160, 256] ++ - [28, 1333.44] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [53, 2253.49] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [55, 3297.43] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [50, 4272.82] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 
2080] ++ - [47, 5041.8] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [47, 5292.29] ++ - - [256, 512, 1, 64, 288, 288, 96, 512] ++ - [26, 1523.26] ++ - - [256, 512, 1, 128, 288, 288, 160, 512] ++ - [8, 2693.4] ++ - - [256, 512, 1, 256, 288, 288, 288, 512] ++ - [28, 4398.86] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [47, 6423.75] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [53, 8414.39] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [53, 9963.46] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [55, 10609.7] ++ - - [256, 1024, 1, 64, 288, 288, 96, 1024] ++ - [20, 2974.16] ++ - - [256, 1024, 1, 128, 288, 288, 160, 1024] ++ - [10, 5281.67] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [10, 8563.08] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [47, 12304.5] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [47, 16288.6] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [53, 19631.1] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [54, 20980.1] ++ - - [256, 2048, 1, 64, 288, 288, 96, 2048] ++ - [15, 4992.48] ++ - - [256, 2048, 1, 128, 288, 288, 160, 2048] ++ - [12, 8894.51] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [15, 14519.5] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [12, 21004.4] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [13, 27105.2] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [39, 34704.0] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [7, 37233.6] ++ - - [256, 4096, 1, 64, 288, 288, 96, 4096] ++ - [34, 9151.66] ++ - - [256, 4096, 1, 128, 288, 288, 160, 4096] ++ - [22, 15080.6] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [22, 23705.0] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [2, 30326.6] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [0, 34929.8] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [2, 36372.8] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [42, 36913.0] ++ - - [384, 64, 1, 64, 416, 
416, 96, 96] ++ - [25, 250.377] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [7, 453.603] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [4, 746.76] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [47, 1100.39] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [53, 1488.22] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [53, 1797.37] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [47, 1925.06] ++ - - [384, 128, 1, 64, 416, 416, 96, 128] ++ - [26, 544.998] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [10, 992.66] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [28, 1618.38] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [53, 2448.04] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [55, 3157.97] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [47, 3737.13] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [50, 3965.47] ++ - - [384, 256, 1, 64, 416, 416, 96, 256] ++ - [20, 1203.19] ++ - - [384, 256, 1, 128, 416, 416, 160, 256] ++ - [10, 2120.83] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [10, 3282.79] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [53, 4840.06] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [53, 6297.75] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [50, 7403.08] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [55, 7934.69] ++ - - [384, 512, 1, 64, 416, 416, 96, 512] ++ - [41, 2306.25] ++ - - [384, 512, 1, 128, 416, 416, 160, 512] ++ - [20, 4076.76] ++ - - [384, 512, 1, 256, 416, 416, 288, 512] ++ - [41, 6625.2] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [53, 9664.32] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [53, 12355.9] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [50, 14727.6] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [50, 15674.8] ++ - - [384, 1024, 1, 64, 416, 416, 96, 1024] ++ - [34, 3781.49] ++ - - [384, 1024, 1, 128, 416, 416, 160, 1024] ++ - [34, 6701.06] ++ - - [384, 1024, 1, 256, 416, 416, 288, 1024] ++ - [39, 
11046.2] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [41, 17203.0] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [40, 22367.2] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [39, 26115.0] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [39, 28500.4] ++ - - [384, 2048, 1, 64, 416, 416, 96, 2048] ++ - [34, 6486.04] ++ - - [384, 2048, 1, 128, 416, 416, 160, 2048] ++ - [56, 11157.5] ++ - - [384, 2048, 1, 256, 416, 416, 288, 2048] ++ - [22, 18842.0] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [43, 26955.0] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [13, 32923.4] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [11, 35412.1] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [30, 38037.8] ++ - - [384, 4096, 1, 64, 416, 416, 96, 4096] ++ - [23, 12973.7] ++ - - [384, 4096, 1, 128, 416, 416, 160, 4096] ++ - [25, 20040.5] ++ - - [384, 4096, 1, 256, 416, 416, 288, 4096] ++ - [2, 27114.7] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] ++ - [22, 32410.6] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [21, 34530.7] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [25, 37698.6] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [1, 37848.7] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [1, 518.071] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [9, 947.937] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [27, 1453.33] ++ - - [768, 64, 1, 512, 800, 800, 544, 544] ++ - [53, 2206.95] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [53, 2972.93] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [53, 3626.99] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [46, 3875.54] ++ - - [768, 128, 1, 64, 800, 800, 96, 128] ++ - [20, 1111.37] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [10, 2008.13] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [20, 3269.56] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [53, 4787.11] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [50, 
6266.0] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [47, 7419.72] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [50, 7903.38] ++ - - [768, 256, 1, 64, 800, 800, 96, 256] ++ - [10, 2287.4] ++ - - [768, 256, 1, 128, 800, 800, 160, 256] ++ - [28, 4036.87] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [8, 6519.66] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [47, 9379.75] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [47, 12341.5] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [53, 14587.3] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [55, 15924.6] ++ - - [768, 512, 1, 64, 800, 800, 96, 512] ++ - [26, 3966.89] ++ - - [768, 512, 1, 128, 800, 800, 160, 512] ++ - [16, 7002.18] ++ - - [768, 512, 1, 256, 800, 800, 288, 512] ++ - [16, 11369.2] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [41, 16722.9] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [18, 21521.9] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [41, 26239.2] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [40, 28634.7] ++ - - [768, 1024, 1, 64, 800, 800, 96, 1024] ++ - [45, 6681.49] ++ - - [768, 1024, 1, 128, 800, 800, 160, 1024] ++ - [24, 11541.3] ++ - - [768, 1024, 1, 256, 800, 800, 288, 1024] ++ - [34, 19459.4] ++ - - [768, 1024, 1, 512, 800, 800, 544, 1024] ++ - [30, 26103.9] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [45, 32141.6] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [13, 34545.5] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [32, 37948.6] ++ - - [768, 2048, 1, 64, 800, 800, 96, 2048] ++ - [16, 13085.1] ++ - - [768, 2048, 1, 128, 800, 800, 160, 2048] ++ - [6, 19831.2] ++ - - [768, 2048, 1, 256, 800, 800, 288, 2048] ++ - [25, 27256.1] ++ - - [768, 2048, 1, 512, 800, 800, 544, 2048] ++ - [25, 33344.6] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [34, 35708.1] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [0, 38321.0] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [2, 
39535.7] ++ - - [768, 4096, 1, 64, 800, 800, 96, 4096] ++ - [30, 17710.0] ++ - - [768, 4096, 1, 128, 800, 800, 160, 4096] ++ - [23, 25159.5] ++ - - [768, 4096, 1, 256, 800, 800, 288, 4096] ++ - [22, 32405.4] ++ - - [768, 4096, 1, 512, 800, 800, 544, 4096] ++ - [6, 35575.6] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [2, 39104.9] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [6, 41104.4] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [25, 41108.6] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [1, 1037.85] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [3, 1794.48] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [47, 2883.35] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [53, 4409.26] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [53, 5905.4] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [55, 7219.89] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [53, 7926.88] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 128] ++ - [1, 2154.98] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [10, 3805.51] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [10, 6264.83] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [53, 9255.57] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [47, 12215.0] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [46, 14717.9] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [47, 16145.2] ++ - - [1536, 256, 1, 64, 1568, 1568, 96, 256] ++ - [41, 3954.4] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 256] ++ - [20, 7055.18] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [26, 11583.8] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [9, 17219.2] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [10, 22442.0] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [8, 26709.1] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [41, 27642.9] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 512] ++ - [6, 6768.65] ++ - - [1536, 512, 1, 
128, 1568, 1568, 160, 512] ++ - [32, 11558.5] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 512] ++ - [35, 19234.4] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [12, 25982.7] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [12, 32229.0] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [29, 34549.2] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [32, 38137.3] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] ++ - [34, 12700.4] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] ++ - [25, 19892.0] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] ++ - [25, 27507.4] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] ++ - [22, 33418.0] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [37, 35893.5] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [5, 38562.8] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [2, 40237.7] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] ++ - [32, 18191.6] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] ++ - [22, 25969.3] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] ++ - [25, 32777.4] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] ++ - [22, 35404.3] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] ++ - [37, 39146.7] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [6, 41178.5] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [2, 41193.0] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] ++ - [30, 22263.3] ++ - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] ++ - [34, 30174.9] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] ++ - [25, 34769.9] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] ++ - [25, 38601.2] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] ++ - [34, 41378.2] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [2, 41700.9] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [1, 42008.5] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [9, 1972.24] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [27, 
3181.12] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [19, 5311.49] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [41, 7775.64] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [8, 10743.2] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [8, 12693.6] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [3, 11954.7] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 128] ++ - [8, 3746.03] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [4, 6693.06] ++ - - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [10, 10504.4] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [27, 15992.3] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [7, 21466.8] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [26, 27083.7] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [33, 22653.8] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 256] ++ - [15, 6472.69] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 256] ++ - [0, 11221.0] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [13, 18726.4] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [32, 25910.8] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [32, 33064.0] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [11, 35196.2] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [30, 37078.1] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 512] ++ - [22, 12697.2] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 512] ++ - [6, 19957.1] ++ - - [3072, 512, 1, 256, 3104, 3104, 288, 512] ++ - [2, 27432.4] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [25, 33431.8] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [25, 35728.7] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [6, 38738.6] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [22, 38129.4] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] ++ - [32, 17600.0] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] ++ - [25, 25620.6] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] ++ - [22, 32972.0] 
++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] ++ - [6, 35763.6] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [34, 39213.0] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [2, 41225.1] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [22, 40048.3] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] ++ - [32, 22276.8] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] ++ - [15, 29866.1] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] ++ - [22, 34564.7] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] ++ - [6, 38560.5] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] ++ - [37, 41381.6] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [6, 42072.0] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [35, 40767.3] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] ++ - [32, 25918.3] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] ++ - [2, 31324.0] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] ++ - [22, 37431.8] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] ++ - [6, 40440.3] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] ++ - [34, 41824.2] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] ++ - [2, 42125.6] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [36, 39694.2] ++ - - [4096, 64, 1, 64, 4128, 4128, 96, 96] ++ - [51, 2225.09] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [48, 4034.46] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [31, 6528.73] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [14, 9786.22] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [1, 13383.6] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [1, 16551.2] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [57, 14478.9] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 128] ++ - [44, 5777.28] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [34, 10209.8] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [6, 16418.1] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [22, 
23707.1] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [22, 30505.8] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [6, 35357.7] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [57, 29006.0] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 256] ++ - [32, 9791.2] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 256] ++ - [34, 16029.8] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [21, 23253.2] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [21, 30055.0] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [21, 35151.7] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [23, 36970.2] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [43, 35837.1] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 512] ++ - [2, 15224.4] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 512] ++ - [15, 22812.6] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 512] ++ - [6, 30100.5] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [2, 35160.9] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [2, 37283.4] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [22, 39988.6] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [45, 38778.3] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] ++ - [30, 20231.8] ++ - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] ++ - [22, 28039.5] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] ++ - [22, 34507.7] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] ++ - [22, 37051.8] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [34, 40294.0] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [2, 41995.2] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [21, 40689.7] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] ++ - [30, 23944.0] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] ++ - [22, 31228.9] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] ++ - [25, 35623.4] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] ++ - [25, 39644.6] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 
2048] ++ - [34, 42000.7] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [6, 42446.6] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [22, 41119.4] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] ++ - [30, 24087.9] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] ++ - [32, 30360.0] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] ++ - [34, 34619.0] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] ++ - [25, 40413.4] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] ++ - [34, 42253.0] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] ++ - [6, 42182.1] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [44, 38552.0] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HB_GB.yaml +new file mode 100644 +index 00000000..55146556 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HB_GB.yaml +@@ -0,0 +1,16503 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: 
[] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: 
[] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: 
[] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: 
[] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 
32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: 
Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 
1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ 
SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: 
true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ 
LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ 
ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ 
WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: 
[11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ 
PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true 
++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 
1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 
++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ 
LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ 
ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ 
WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: 
[11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ 
PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true 
++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ 
LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ 
ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ 
VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ 
MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ 
LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ 
VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false 
++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false 
++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 
1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ 
LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ 
VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 
1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ 
LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ 
VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ 
GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 
1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ 
LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ 
VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ 
CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 
1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ 
NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ 
LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ 
VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ 
SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [2, 36.6584] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [15, 66.2482] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [10, 105.216] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [47, 166.441] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [47, 231.129] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [47, 288.994] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [53, 320.819] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [12, 64.6632] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [28, 118.644] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [53, 203.726] 
++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [47, 323.485] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [47, 461.979] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [47, 577.549] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [47, 646.098] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [41, 147.065] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [28, 268.041] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [20, 452.655] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [53, 708.497] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [47, 981.124] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [47, 1204.74] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [53, 1312.13] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [19, 317.318] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [47, 574.326] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [47, 966.875] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [53, 1487.87] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [53, 2018.92] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [47, 2505.66] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [52, 2672.86] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [40, 655.975] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [55, 1184.0] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [55, 1997.77] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [53, 3070.23] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [53, 4166.19] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [46, 5088.44] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [49, 5478.84] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [38, 1391.61] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [17, 2498.1] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [55, 4133.34] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [53, 6256.67] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [46, 8287.1] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [53, 10379.9] ++ - - [64, 2048, 1, 
4096, 96, 96, 4128, 4128] ++ - [54, 10796.2] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [7, 2505.18] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [17, 4460.26] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [38, 7472.32] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [27, 11140.2] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [27, 15017.4] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [9, 18041.2] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [53, 14841.4] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [28, 73.44] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [15, 134.969] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [10, 204.58] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [28, 320.031] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [47, 449.913] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [47, 565.651] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [47, 627.139] ++ - - [128, 128, 1, 64, 160, 160, 96, 128] ++ - [45, 173.605] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [10, 323.535] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [10, 531.867] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [10, 782.228] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [55, 1023.06] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [47, 1219.1] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [53, 1300.54] ++ - - [128, 256, 1, 64, 160, 160, 96, 256] ++ - [10, 367.728] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [10, 654.133] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [8, 1063.87] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [53, 1571.05] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [47, 2075.75] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [50, 2502.85] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [50, 2630.79] ++ - - [128, 512, 1, 64, 160, 160, 96, 512] ++ - [28, 799.524] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [28, 1398.11] ++ 
- - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [41, 2267.51] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [47, 3304.57] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [53, 4283.19] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [55, 4984.32] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [55, 5312.82] ++ - - [128, 1024, 1, 64, 160, 160, 96, 1024] ++ - [41, 1612.26] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [28, 2817.33] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [28, 4377.62] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [47, 6467.09] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [55, 8650.85] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [50, 10209.4] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [55, 10739.8] ++ - - [128, 2048, 1, 64, 160, 160, 96, 2048] ++ - [10, 2976.79] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [26, 5254.37] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [28, 8608.14] ++ - - [128, 2048, 1, 512, 160, 160, 544, 2048] ++ - [55, 12586.1] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [50, 16523.2] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [53, 20043.7] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [54, 20504.6] ++ - - [128, 4096, 1, 64, 160, 160, 96, 4096] ++ - [20, 4916.4] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [41, 9095.84] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [28, 15075.6] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [41, 22510.4] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [28, 30695.9] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [9, 36539.3] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [28, 32997.1] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [20, 170.5] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [9, 311.427] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [10, 493.623] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [47, 717.895] 
++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [47, 977.867] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [54, 1172.13] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [55, 1279.22] ++ - - [256, 128, 1, 64, 288, 288, 96, 128] ++ - [28, 379.988] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [10, 684.229] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [10, 1111.66] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [10, 1621.93] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [53, 2105.84] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [53, 2496.43] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [50, 2621.85] ++ - - [256, 256, 1, 64, 288, 288, 96, 256] ++ - [32, 754.643] ++ - - [256, 256, 1, 128, 288, 288, 160, 256] ++ - [28, 1333.44] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [53, 2253.49] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [55, 3297.43] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [50, 4272.82] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [47, 5041.8] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [47, 5292.29] ++ - - [256, 512, 1, 64, 288, 288, 96, 512] ++ - [26, 1523.26] ++ - - [256, 512, 1, 128, 288, 288, 160, 512] ++ - [8, 2693.4] ++ - - [256, 512, 1, 256, 288, 288, 288, 512] ++ - [28, 4398.86] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [47, 6423.75] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [53, 8414.39] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [53, 9963.46] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [55, 10609.7] ++ - - [256, 1024, 1, 64, 288, 288, 96, 1024] ++ - [20, 2974.16] ++ - - [256, 1024, 1, 128, 288, 288, 160, 1024] ++ - [10, 5281.67] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [10, 8563.08] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [47, 12304.5] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [47, 16288.6] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [53, 19631.1] ++ - - 
[256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [54, 20980.1] ++ - - [256, 2048, 1, 64, 288, 288, 96, 2048] ++ - [15, 4992.48] ++ - - [256, 2048, 1, 128, 288, 288, 160, 2048] ++ - [12, 8894.51] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [15, 14519.5] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [12, 21004.4] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [13, 27105.2] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [39, 34704.0] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [7, 37233.6] ++ - - [256, 4096, 1, 64, 288, 288, 96, 4096] ++ - [34, 9151.66] ++ - - [256, 4096, 1, 128, 288, 288, 160, 4096] ++ - [22, 15080.6] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [22, 23705.0] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [2, 30326.6] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [0, 34929.8] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [2, 36372.8] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [42, 36913.0] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [25, 250.377] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [7, 453.603] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [4, 746.76] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [47, 1100.39] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [53, 1488.22] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [53, 1797.37] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [47, 1925.06] ++ - - [384, 128, 1, 64, 416, 416, 96, 128] ++ - [26, 544.998] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [10, 992.66] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [28, 1618.38] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [53, 2448.04] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [55, 3157.97] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [47, 3737.13] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [50, 3965.47] ++ - - [384, 256, 1, 64, 416, 416, 96, 256] ++ - [20, 1203.19] ++ - - [384, 256, 1, 
128, 416, 416, 160, 256] ++ - [10, 2120.83] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [10, 3282.79] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [53, 4840.06] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [53, 6297.75] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [50, 7403.08] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [55, 7934.69] ++ - - [384, 512, 1, 64, 416, 416, 96, 512] ++ - [41, 2306.25] ++ - - [384, 512, 1, 128, 416, 416, 160, 512] ++ - [20, 4076.76] ++ - - [384, 512, 1, 256, 416, 416, 288, 512] ++ - [41, 6625.2] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [53, 9664.32] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [53, 12355.9] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [50, 14727.6] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [50, 15674.8] ++ - - [384, 1024, 1, 64, 416, 416, 96, 1024] ++ - [34, 3781.49] ++ - - [384, 1024, 1, 128, 416, 416, 160, 1024] ++ - [34, 6701.06] ++ - - [384, 1024, 1, 256, 416, 416, 288, 1024] ++ - [39, 11046.2] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [41, 17203.0] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [40, 22367.2] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [39, 26115.0] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [39, 28500.4] ++ - - [384, 2048, 1, 64, 416, 416, 96, 2048] ++ - [34, 6486.04] ++ - - [384, 2048, 1, 128, 416, 416, 160, 2048] ++ - [56, 11157.5] ++ - - [384, 2048, 1, 256, 416, 416, 288, 2048] ++ - [22, 18842.0] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [43, 26955.0] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [13, 32923.4] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [11, 35412.1] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [30, 38037.8] ++ - - [384, 4096, 1, 64, 416, 416, 96, 4096] ++ - [23, 12973.7] ++ - - [384, 4096, 1, 128, 416, 416, 160, 4096] ++ - [25, 20040.5] ++ - - [384, 4096, 1, 256, 416, 416, 288, 4096] ++ - [2, 27114.7] ++ - - [384, 
4096, 1, 512, 416, 416, 544, 4096] ++ - [22, 32410.6] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [21, 34530.7] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [25, 37698.6] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [1, 37848.7] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [1, 518.071] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [9, 947.937] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [27, 1453.33] ++ - - [768, 64, 1, 512, 800, 800, 544, 544] ++ - [53, 2206.95] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [53, 2972.93] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [53, 3626.99] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [46, 3875.54] ++ - - [768, 128, 1, 64, 800, 800, 96, 128] ++ - [20, 1111.37] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [10, 2008.13] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [20, 3269.56] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [53, 4787.11] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [50, 6266.0] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [47, 7419.72] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [50, 7903.38] ++ - - [768, 256, 1, 64, 800, 800, 96, 256] ++ - [10, 2287.4] ++ - - [768, 256, 1, 128, 800, 800, 160, 256] ++ - [28, 4036.87] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [8, 6519.66] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [47, 9379.75] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [47, 12341.5] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [53, 14587.3] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [55, 15924.6] ++ - - [768, 512, 1, 64, 800, 800, 96, 512] ++ - [26, 3966.89] ++ - - [768, 512, 1, 128, 800, 800, 160, 512] ++ - [16, 7002.18] ++ - - [768, 512, 1, 256, 800, 800, 288, 512] ++ - [16, 11369.2] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [41, 16722.9] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [18, 21521.9] ++ - - [768, 512, 1, 2048, 800, 800, 
2080, 2080] ++ - [41, 26239.2] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [40, 28634.7] ++ - - [768, 1024, 1, 64, 800, 800, 96, 1024] ++ - [45, 6681.49] ++ - - [768, 1024, 1, 128, 800, 800, 160, 1024] ++ - [24, 11541.3] ++ - - [768, 1024, 1, 256, 800, 800, 288, 1024] ++ - [34, 19459.4] ++ - - [768, 1024, 1, 512, 800, 800, 544, 1024] ++ - [30, 26103.9] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [45, 32141.6] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [13, 34545.5] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [32, 37948.6] ++ - - [768, 2048, 1, 64, 800, 800, 96, 2048] ++ - [16, 13085.1] ++ - - [768, 2048, 1, 128, 800, 800, 160, 2048] ++ - [6, 19831.2] ++ - - [768, 2048, 1, 256, 800, 800, 288, 2048] ++ - [25, 27256.1] ++ - - [768, 2048, 1, 512, 800, 800, 544, 2048] ++ - [25, 33344.6] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [34, 35708.1] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [0, 38321.0] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [2, 39535.7] ++ - - [768, 4096, 1, 64, 800, 800, 96, 4096] ++ - [30, 17710.0] ++ - - [768, 4096, 1, 128, 800, 800, 160, 4096] ++ - [23, 25159.5] ++ - - [768, 4096, 1, 256, 800, 800, 288, 4096] ++ - [22, 32405.4] ++ - - [768, 4096, 1, 512, 800, 800, 544, 4096] ++ - [6, 35575.6] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [2, 39104.9] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [6, 41104.4] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [25, 41108.6] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [1, 1037.85] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [3, 1794.48] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [47, 2883.35] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [53, 4409.26] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [53, 5905.4] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [55, 7219.89] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [53, 7926.88] ++ - - [1536, 
128, 1, 64, 1568, 1568, 96, 128] ++ - [1, 2154.98] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [10, 3805.51] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [10, 6264.83] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [53, 9255.57] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [47, 12215.0] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [46, 14717.9] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [47, 16145.2] ++ - - [1536, 256, 1, 64, 1568, 1568, 96, 256] ++ - [41, 3954.4] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 256] ++ - [20, 7055.18] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [26, 11583.8] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [9, 17219.2] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [10, 22442.0] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [8, 26709.1] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [41, 27642.9] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 512] ++ - [6, 6768.65] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 512] ++ - [32, 11558.5] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 512] ++ - [35, 19234.4] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [12, 25982.7] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [12, 32229.0] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [29, 34549.2] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [32, 38137.3] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] ++ - [34, 12700.4] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] ++ - [25, 19892.0] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] ++ - [25, 27507.4] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] ++ - [22, 33418.0] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [37, 35893.5] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [5, 38562.8] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [2, 40237.7] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] ++ - [32, 18191.6] ++ - - 
[1536, 2048, 1, 128, 1568, 1568, 160, 2048] ++ - [22, 25969.3] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] ++ - [25, 32777.4] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] ++ - [22, 35404.3] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] ++ - [37, 39146.7] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [6, 41178.5] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [2, 41193.0] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] ++ - [30, 22263.3] ++ - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] ++ - [34, 30174.9] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] ++ - [25, 34769.9] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] ++ - [25, 38601.2] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] ++ - [34, 41378.2] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [2, 41700.9] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [1, 42008.5] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [9, 1972.24] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [27, 3181.12] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [19, 5311.49] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [41, 7775.64] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [8, 10743.2] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [8, 12693.6] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [3, 11954.7] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 128] ++ - [8, 3746.03] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [4, 6693.06] ++ - - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [10, 10504.4] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [27, 15992.3] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [7, 21466.8] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [26, 27083.7] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [33, 22653.8] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 256] ++ - [15, 6472.69] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 256] ++ - [0, 11221.0] ++ - - 
[3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [13, 18726.4] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [32, 25910.8] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [32, 33064.0] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [11, 35196.2] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [30, 37078.1] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 512] ++ - [22, 12697.2] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 512] ++ - [6, 19957.1] ++ - - [3072, 512, 1, 256, 3104, 3104, 288, 512] ++ - [2, 27432.4] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [25, 33431.8] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [25, 35728.7] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [6, 38738.6] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [22, 38129.4] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] ++ - [32, 17600.0] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] ++ - [25, 25620.6] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] ++ - [22, 32972.0] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] ++ - [6, 35763.6] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [34, 39213.0] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [2, 41225.1] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [22, 40048.3] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] ++ - [32, 22276.8] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] ++ - [15, 29866.1] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] ++ - [22, 34564.7] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] ++ - [6, 38560.5] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] ++ - [37, 41381.6] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [6, 42072.0] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [35, 40767.3] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] ++ - [32, 25918.3] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] ++ - [2, 31324.0] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] ++ - 
[22, 37431.8] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] ++ - [6, 40440.3] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] ++ - [34, 41824.2] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] ++ - [2, 42125.6] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [36, 39694.2] ++ - - [4096, 64, 1, 64, 4128, 4128, 96, 96] ++ - [51, 2225.09] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [48, 4034.46] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [31, 6528.73] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [14, 9786.22] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [1, 13383.6] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [1, 16551.2] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [57, 14478.9] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 128] ++ - [44, 5777.28] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [34, 10209.8] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [6, 16418.1] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [22, 23707.1] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [22, 30505.8] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [6, 35357.7] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [57, 29006.0] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 256] ++ - [32, 9791.2] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 256] ++ - [34, 16029.8] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [21, 23253.2] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [21, 30055.0] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [21, 35151.7] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [23, 36970.2] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [43, 35837.1] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 512] ++ - [2, 15224.4] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 512] ++ - [15, 22812.6] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 512] ++ - [6, 30100.5] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [2, 
35160.9] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [2, 37283.4] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [22, 39988.6] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [45, 38778.3] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] ++ - [30, 20231.8] ++ - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] ++ - [22, 28039.5] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] ++ - [22, 34507.7] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] ++ - [22, 37051.8] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [34, 40294.0] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [2, 41995.2] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [21, 40689.7] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] ++ - [30, 23944.0] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] ++ - [22, 31228.9] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] ++ - [25, 35623.4] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] ++ - [25, 39644.6] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] ++ - [34, 42000.7] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [6, 42446.6] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [22, 41119.4] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] ++ - [30, 24087.9] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] ++ - [32, 30360.0] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] ++ - [34, 34619.0] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] ++ - [25, 40413.4] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] ++ - [34, 42253.0] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] ++ - [6, 42182.1] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [44, 38552.0] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HHS_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HHS_BH.yaml +new file mode 100644 +index 00000000..c126ce32 +--- /dev/null ++++ 
b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HHS_BH.yaml +@@ -0,0 +1,17313 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: 
default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ 
MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ 
OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 
++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 
++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 
32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 
32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 
32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 
32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 
32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 
32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 
32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 
32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 
32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 
32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 
32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 
32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ 
LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 
32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: 
[4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 
++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [4, 35.3724] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [19, 59.1948] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [46, 102.141] ++ - - 
[64, 64, 1, 512, 96, 96, 544, 544] ++ - [9, 158.587] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [49, 223.864] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [54, 281.195] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [46, 309.224] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [2, 63.0231] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [19, 114.237] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [46, 197.807] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [51, 312.682] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [54, 440.624] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [49, 557.161] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [54, 619.337] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [9, 140.09] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [46, 254.695] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [18, 434.285] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [45, 697.773] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [46, 946.208] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [49, 1151.45] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [54, 1270.91] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [47, 299.166] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [11, 568.105] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [46, 952.169] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [46, 1458.89] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [54, 1940.35] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [46, 2353.13] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [54, 2564.69] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [46, 617.445] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [9, 1113.58] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [46, 1884.66] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [50, 2876.02] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [49, 3952.46] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [48, 4816.71] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ 
- [46, 5224.21] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [18, 1235.8] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [18, 2225.69] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [53, 3744.09] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [53, 5785.25] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [53, 7990.58] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [54, 10169.9] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [48, 10509.8] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [8, 2249.57] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [8, 4087.02] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [18, 6900.65] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [6, 10499.7] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [16, 14324.2] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [9, 17619.1] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [53, 14861.1] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [44, 73.5843] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [2, 127.751] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [18, 203.173] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [7, 315.884] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [51, 440.186] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [54, 553.247] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [54, 619.863] ++ - - [128, 128, 1, 64, 160, 160, 96, 128] ++ - [37, 169.207] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [37, 295.041] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [18, 487.993] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [9, 734.361] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [51, 978.208] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [51, 1184.71] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [46, 1277.36] ++ - - [128, 256, 1, 64, 160, 160, 96, 256] ++ - [34, 356.962] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [16, 627.61] ++ - - [128, 256, 1, 256, 160, 160, 288, 
288] ++ - [9, 1035.25] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [18, 1535.81] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [49, 2030.77] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [49, 2461.81] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [49, 2584.59] ++ - - [128, 512, 1, 64, 160, 160, 96, 512] ++ - [11, 739.867] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [18, 1309.9] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [9, 2145.15] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [46, 3157.48] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [56, 4113.59] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [51, 4952.51] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [54, 5203.55] ++ - - [128, 1024, 1, 64, 160, 160, 96, 1024] ++ - [18, 1494.76] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [16, 2639.17] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [18, 4302.97] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [59, 6406.57] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [46, 8431.3] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [46, 9816.64] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [46, 10526.7] ++ - - [128, 2048, 1, 64, 160, 160, 96, 2048] ++ - [18, 2796.67] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [18, 4985.8] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [18, 8513.11] ++ - - [128, 2048, 1, 512, 160, 160, 544, 2048] ++ - [46, 12392.0] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [51, 16474.5] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [48, 19672.1] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [53, 20650.1] ++ - - [128, 4096, 1, 64, 160, 160, 96, 4096] ++ - [15, 5112.67] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [18, 9081.04] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [8, 14966.3] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [8, 22407.0] ++ - - [128, 4096, 1, 1024, 160, 160, 
1056, 4096] ++ - [15, 29438.6] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [17, 35692.6] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [18, 33090.7] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [55, 158.803] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [18, 280.894] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [9, 447.346] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [50, 687.704] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [54, 927.328] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [54, 1154.54] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [54, 1250.8] ++ - - [256, 128, 1, 64, 288, 288, 96, 128] ++ - [11, 345.779] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [9, 629.397] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [9, 1039.09] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [51, 1534.55] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [51, 2066.04] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [54, 2430.25] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [49, 2579.62] ++ - - [256, 256, 1, 64, 288, 288, 96, 256] ++ - [34, 741.174] ++ - - [256, 256, 1, 128, 288, 288, 160, 256] ++ - [9, 1316.89] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [9, 2130.98] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [51, 3239.47] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [46, 4199.03] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [51, 4865.96] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [54, 5175.46] ++ - - [256, 512, 1, 64, 288, 288, 96, 512] ++ - [41, 1496.9] ++ - - [256, 512, 1, 128, 288, 288, 160, 512] ++ - [52, 2752.62] ++ - - [256, 512, 1, 256, 288, 288, 288, 512] ++ - [18, 4514.86] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [54, 6514.79] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [18, 8293.24] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [49, 9786.57] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [49, 
10362.7] ++ - - [256, 1024, 1, 64, 288, 288, 96, 1024] ++ - [18, 2958.96] ++ - - [256, 1024, 1, 128, 288, 288, 160, 1024] ++ - [18, 5225.73] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [18, 8489.45] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [18, 12370.3] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [18, 15773.6] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [54, 19280.0] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [54, 20919.2] ++ - - [256, 2048, 1, 64, 288, 288, 96, 2048] ++ - [7, 5002.9] ++ - - [256, 2048, 1, 128, 288, 288, 160, 2048] ++ - [10, 8749.55] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [16, 14328.8] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [16, 21569.7] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [16, 29184.2] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [7, 34958.2] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [16, 36146.9] ++ - - [256, 4096, 1, 64, 288, 288, 96, 4096] ++ - [14, 8672.66] ++ - - [256, 4096, 1, 128, 288, 288, 160, 4096] ++ - [56, 15418.5] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [34, 23770.1] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [32, 31115.7] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [41, 37117.8] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [28, 39090.7] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [32, 40480.0] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [7, 235.847] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [18, 416.82] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [51, 667.743] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [46, 1032.23] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [51, 1436.41] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [54, 1731.58] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [49, 1889.15] ++ - - [384, 128, 1, 64, 416, 416, 96, 128] ++ - [34, 528.96] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [9, 
963.175] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [46, 1629.07] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [54, 2392.42] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [54, 3126.97] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [49, 3645.11] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [49, 3860.23] ++ - - [384, 256, 1, 64, 416, 416, 96, 256] ++ - [21, 1173.13] ++ - - [384, 256, 1, 128, 416, 416, 160, 256] ++ - [18, 2074.33] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [7, 3346.53] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [51, 4865.31] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [54, 6195.06] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [51, 7306.1] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [54, 7816.39] ++ - - [384, 512, 1, 64, 416, 416, 96, 512] ++ - [7, 2269.24] ++ - - [384, 512, 1, 128, 416, 416, 160, 512] ++ - [18, 4000.31] ++ - - [384, 512, 1, 256, 416, 416, 288, 512] ++ - [9, 6496.92] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [46, 9208.13] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [49, 12101.9] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [54, 14474.6] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [54, 15549.2] ++ - - [384, 1024, 1, 64, 416, 416, 96, 1024] ++ - [12, 3905.92] ++ - - [384, 1024, 1, 128, 416, 416, 160, 1024] ++ - [32, 6686.81] ++ - - [384, 1024, 1, 256, 416, 416, 288, 1024] ++ - [17, 11179.9] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [16, 16662.0] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [18, 22290.4] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [17, 26651.7] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [18, 28790.8] ++ - - [384, 2048, 1, 64, 416, 416, 96, 2048] ++ - [34, 6469.36] ++ - - [384, 2048, 1, 128, 416, 416, 160, 2048] ++ - [35, 12217.9] ++ - - [384, 2048, 1, 256, 416, 416, 288, 2048] ++ - [37, 18670.8] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [20, 
27522.4] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [22, 34049.6] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [26, 36463.1] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [39, 36406.3] ++ - - [384, 4096, 1, 64, 416, 416, 96, 4096] ++ - [16, 12657.3] ++ - - [384, 4096, 1, 128, 416, 416, 160, 4096] ++ - [3, 19315.6] ++ - - [384, 4096, 1, 256, 416, 416, 288, 4096] ++ - [14, 26391.4] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] ++ - [4, 31297.2] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [14, 33942.0] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [13, 36838.9] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [5, 37450.5] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [16, 485.752] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [9, 820.161] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [9, 1376.23] ++ - - [768, 64, 1, 512, 800, 800, 544, 544] ++ - [46, 2110.52] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [54, 2862.03] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [49, 3506.33] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [49, 3781.92] ++ - - [768, 128, 1, 64, 800, 800, 96, 128] ++ - [18, 1078.23] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [18, 1967.0] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [7, 3195.66] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [58, 4725.98] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [49, 6249.28] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [54, 7295.51] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [49, 7735.16] ++ - - [768, 256, 1, 64, 800, 800, 96, 256] ++ - [18, 2141.41] ++ - - [768, 256, 1, 128, 800, 800, 160, 256] ++ - [9, 3815.89] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [18, 6442.04] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [18, 9399.0] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [9, 12125.2] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [54, 14413.4] ++ - - 
[768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [54, 15420.0] ++ - - [768, 512, 1, 64, 800, 800, 96, 512] ++ - [37, 3905.31] ++ - - [768, 512, 1, 128, 800, 800, 160, 512] ++ - [34, 6931.78] ++ - - [768, 512, 1, 256, 800, 800, 288, 512] ++ - [18, 11536.0] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [17, 16956.7] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [6, 21543.8] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [18, 26583.1] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [18, 28557.0] ++ - - [768, 1024, 1, 64, 800, 800, 96, 1024] ++ - [34, 6702.84] ++ - - [768, 1024, 1, 128, 800, 800, 160, 1024] ++ - [34, 11594.5] ++ - - [768, 1024, 1, 256, 800, 800, 288, 1024] ++ - [23, 19451.8] ++ - - [768, 1024, 1, 512, 800, 800, 544, 1024] ++ - [30, 26711.8] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [38, 32086.5] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [26, 35618.1] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [30, 39787.9] ++ - - [768, 2048, 1, 64, 800, 800, 96, 2048] ++ - [14, 12965.4] ++ - - [768, 2048, 1, 128, 800, 800, 160, 2048] ++ - [44, 20042.6] ++ - - [768, 2048, 1, 256, 800, 800, 288, 2048] ++ - [32, 27501.8] ++ - - [768, 2048, 1, 512, 800, 800, 544, 2048] ++ - [28, 33332.3] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [44, 36359.4] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [21, 40073.0] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [44, 41485.0] ++ - - [768, 4096, 1, 64, 800, 800, 96, 4096] ++ - [34, 17333.4] ++ - - [768, 4096, 1, 128, 800, 800, 160, 4096] ++ - [34, 25453.9] ++ - - [768, 4096, 1, 256, 800, 800, 288, 4096] ++ - [21, 32512.7] ++ - - [768, 4096, 1, 512, 800, 800, 544, 4096] ++ - [34, 35809.7] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [44, 40149.9] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [41, 42542.7] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [34, 42891.8] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [0, 
975.272] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [7, 1632.24] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [54, 2743.77] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [54, 4194.31] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [59, 5595.21] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [54, 7021.23] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [48, 7699.95] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 128] ++ - [12, 2143.61] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [9, 3924.82] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [18, 6387.27] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [9, 9313.78] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [54, 11926.9] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [46, 14667.0] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [46, 15682.7] ++ - - [1536, 256, 1, 64, 1568, 1568, 96, 256] ++ - [18, 3721.1] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 256] ++ - [6, 6660.29] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [16, 11092.4] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [15, 16444.3] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [44, 21416.6] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [44, 25685.2] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [28, 27191.2] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 512] ++ - [44, 6457.74] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 512] ++ - [58, 12238.7] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 512] ++ - [26, 19137.5] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [43, 27369.1] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [38, 33599.3] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [30, 36802.3] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [29, 40257.3] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] ++ - [13, 12478.4] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] ++ - [41, 
19616.7] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] ++ - [28, 27098.3] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] ++ - [44, 34226.2] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [21, 36458.2] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [41, 40071.5] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [28, 41980.3] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] ++ - [44, 17843.4] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] ++ - [44, 25563.7] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] ++ - [41, 32223.9] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] ++ - [41, 36222.1] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] ++ - [41, 40045.6] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [41, 42563.0] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [44, 43059.2] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] ++ - [41, 21679.5] ++ - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] ++ - [34, 29599.3] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] ++ - [34, 33815.1] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] ++ - [44, 38645.2] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] ++ - [41, 41823.0] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [44, 42770.2] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [36, 43247.3] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [15, 1879.73] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [31, 3021.83] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [15, 5104.11] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [8, 7937.49] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [15, 10948.2] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [7, 12625.9] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [9, 12220.3] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 128] ++ - [44, 3658.89] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [16, 6709.1] ++ - - [3072, 128, 1, 256, 3104, 3104, 
288, 288] ++ - [15, 10666.9] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [16, 16065.0] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [6, 21593.5] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [44, 25465.9] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [27, 22539.3] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 256] ++ - [24, 6864.65] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 256] ++ - [60, 11658.9] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [41, 18509.4] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [38, 26204.2] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [26, 34467.8] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [42, 37075.9] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [42, 35658.3] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 512] ++ - [4, 12361.9] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 512] ++ - [5, 19468.8] ++ - - [3072, 512, 1, 256, 3104, 3104, 288, 512] ++ - [28, 27140.3] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [37, 33603.5] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [44, 36666.5] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [44, 39971.6] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [32, 40473.6] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] ++ - [34, 17348.3] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] ++ - [34, 25067.2] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] ++ - [44, 32251.0] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] ++ - [37, 35884.7] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [44, 40045.1] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [44, 42618.2] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [44, 42768.5] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] ++ - [44, 21665.5] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] ++ - [37, 29264.8] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] ++ - [34, 34128.2] ++ - - [3072, 2048, 1, 
512, 3104, 3104, 544, 2048] ++ - [41, 38685.1] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] ++ - [44, 41801.3] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [44, 42907.2] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [41, 42739.0] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] ++ - [34, 24917.5] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] ++ - [41, 30784.5] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] ++ - [41, 36354.5] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] ++ - [41, 40252.5] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] ++ - [37, 41771.8] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] ++ - [40, 42839.0] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [44, 41231.2] ++ - - [4096, 64, 1, 64, 4128, 4128, 96, 96] ++ - [1, 2395.03] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [24, 4100.52] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [33, 6805.48] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [33, 10446.6] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [27, 13849.0] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [40, 16299.5] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [57, 14729.2] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 128] ++ - [37, 5800.27] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [25, 10186.6] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [34, 16292.6] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [37, 22693.0] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [37, 29239.8] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [25, 34083.8] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [59, 28949.3] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 256] ++ - [37, 9727.33] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 256] ++ - [41, 16118.4] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [34, 23625.8] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [37, 30986.4] ++ - - [4096, 
256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [28, 37067.8] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [32, 38750.7] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [28, 37900.3] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 512] ++ - [41, 14441.3] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 512] ++ - [44, 21972.3] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 512] ++ - [34, 29578.1] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [28, 36156.6] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [41, 38337.7] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [41, 41676.9] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [28, 41748.0] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] ++ - [34, 19134.4] ++ - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] ++ - [41, 27035.5] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] ++ - [37, 33723.1] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] ++ - [41, 37471.4] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [41, 40968.1] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [28, 43023.1] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [34, 42660.1] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] ++ - [34, 23126.1] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] ++ - [44, 30466.0] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] ++ - [41, 35304.8] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] ++ - [44, 39559.8] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] ++ - [41, 42293.5] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [44, 42711.1] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [44, 43141.1] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] ++ - [44, 23646.6] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] ++ - [37, 30074.7] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] ++ - [13, 34815.7] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] ++ - [13, 39747.8] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 
4096] ++ - [44, 41444.8] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] ++ - [37, 42986.9] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [60, 40108.5] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HHS_BH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HHS_BH_GB.yaml +new file mode 100644 +index 00000000..2bc6d0e0 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_HHS_BH_GB.yaml +@@ -0,0 +1,17313 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: 
{} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 1 ++ LVCA: 4 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4352 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25088 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ 
LSCA: 32 ++ LSCB: 128 ++ LSPA: 4 ++ LSPB: 1 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ 
SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8704 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 
++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [4, 35.3724] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [19, 59.1948] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [46, 102.141] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [9, 158.587] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [49, 223.864] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [54, 281.195] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [46, 309.224] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [2, 63.0231] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [19, 114.237] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [46, 197.807] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [51, 312.682] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [54, 440.624] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [49, 557.161] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [54, 619.337] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [9, 140.09] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [46, 254.695] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [18, 434.285] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [45, 697.773] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [46, 946.208] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [49, 1151.45] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [54, 1270.91] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [47, 299.166] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [11, 568.105] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [46, 952.169] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [46, 1458.89] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [54, 1940.35] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [46, 2353.13] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 
4128] ++ - [54, 2564.69] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [46, 617.445] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [9, 1113.58] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [46, 1884.66] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [50, 2876.02] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [49, 3952.46] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [48, 4816.71] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [46, 5224.21] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [18, 1235.8] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [18, 2225.69] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [53, 3744.09] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [53, 5785.25] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [53, 7990.58] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [54, 10169.9] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [48, 10509.8] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [8, 2249.57] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [8, 4087.02] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [18, 6900.65] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [6, 10499.7] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [16, 14324.2] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [9, 17619.1] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [53, 14861.1] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [44, 73.5843] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [2, 127.751] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [18, 203.173] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [7, 315.884] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [51, 440.186] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [54, 553.247] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [54, 619.863] ++ - - [128, 128, 1, 64, 160, 160, 96, 128] ++ - [37, 169.207] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [37, 295.041] ++ - - [128, 128, 1, 256, 160, 160, 288, 
288] ++ - [18, 487.993] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [9, 734.361] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [51, 978.208] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [51, 1184.71] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [46, 1277.36] ++ - - [128, 256, 1, 64, 160, 160, 96, 256] ++ - [34, 356.962] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [16, 627.61] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [9, 1035.25] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [18, 1535.81] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [49, 2030.77] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [49, 2461.81] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [49, 2584.59] ++ - - [128, 512, 1, 64, 160, 160, 96, 512] ++ - [11, 739.867] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [18, 1309.9] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [9, 2145.15] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [46, 3157.48] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [56, 4113.59] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [51, 4952.51] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [54, 5203.55] ++ - - [128, 1024, 1, 64, 160, 160, 96, 1024] ++ - [18, 1494.76] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [16, 2639.17] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [18, 4302.97] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [59, 6406.57] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [46, 8431.3] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [46, 9816.64] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [46, 10526.7] ++ - - [128, 2048, 1, 64, 160, 160, 96, 2048] ++ - [18, 2796.67] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [18, 4985.8] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [18, 8513.11] ++ - - [128, 2048, 1, 512, 160, 160, 544, 2048] ++ - [46, 12392.0] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] 
++ - [51, 16474.5] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [48, 19672.1] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [53, 20650.1] ++ - - [128, 4096, 1, 64, 160, 160, 96, 4096] ++ - [15, 5112.67] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [18, 9081.04] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [8, 14966.3] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [8, 22407.0] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [15, 29438.6] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [17, 35692.6] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [18, 33090.7] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [55, 158.803] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [18, 280.894] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [9, 447.346] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [50, 687.704] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [54, 927.328] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [54, 1154.54] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [54, 1250.8] ++ - - [256, 128, 1, 64, 288, 288, 96, 128] ++ - [11, 345.779] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [9, 629.397] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [9, 1039.09] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [51, 1534.55] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [51, 2066.04] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [54, 2430.25] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [49, 2579.62] ++ - - [256, 256, 1, 64, 288, 288, 96, 256] ++ - [34, 741.174] ++ - - [256, 256, 1, 128, 288, 288, 160, 256] ++ - [9, 1316.89] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [9, 2130.98] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [51, 3239.47] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [46, 4199.03] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [51, 4865.96] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [54, 5175.46] 
++ - - [256, 512, 1, 64, 288, 288, 96, 512] ++ - [41, 1496.9] ++ - - [256, 512, 1, 128, 288, 288, 160, 512] ++ - [52, 2752.62] ++ - - [256, 512, 1, 256, 288, 288, 288, 512] ++ - [18, 4514.86] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [54, 6514.79] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [18, 8293.24] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [49, 9786.57] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [49, 10362.7] ++ - - [256, 1024, 1, 64, 288, 288, 96, 1024] ++ - [18, 2958.96] ++ - - [256, 1024, 1, 128, 288, 288, 160, 1024] ++ - [18, 5225.73] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [18, 8489.45] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [18, 12370.3] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [18, 15773.6] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [54, 19280.0] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [54, 20919.2] ++ - - [256, 2048, 1, 64, 288, 288, 96, 2048] ++ - [7, 5002.9] ++ - - [256, 2048, 1, 128, 288, 288, 160, 2048] ++ - [10, 8749.55] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [16, 14328.8] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [16, 21569.7] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [16, 29184.2] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [7, 34958.2] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [16, 36146.9] ++ - - [256, 4096, 1, 64, 288, 288, 96, 4096] ++ - [14, 8672.66] ++ - - [256, 4096, 1, 128, 288, 288, 160, 4096] ++ - [56, 15418.5] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [34, 23770.1] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [32, 31115.7] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [41, 37117.8] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [28, 39090.7] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [32, 40480.0] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [7, 235.847] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [18, 416.82] 
++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [51, 667.743] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [46, 1032.23] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [51, 1436.41] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [54, 1731.58] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [49, 1889.15] ++ - - [384, 128, 1, 64, 416, 416, 96, 128] ++ - [34, 528.96] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [9, 963.175] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [46, 1629.07] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [54, 2392.42] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [54, 3126.97] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [49, 3645.11] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [49, 3860.23] ++ - - [384, 256, 1, 64, 416, 416, 96, 256] ++ - [21, 1173.13] ++ - - [384, 256, 1, 128, 416, 416, 160, 256] ++ - [18, 2074.33] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [7, 3346.53] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [51, 4865.31] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [54, 6195.06] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [51, 7306.1] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [54, 7816.39] ++ - - [384, 512, 1, 64, 416, 416, 96, 512] ++ - [7, 2269.24] ++ - - [384, 512, 1, 128, 416, 416, 160, 512] ++ - [18, 4000.31] ++ - - [384, 512, 1, 256, 416, 416, 288, 512] ++ - [9, 6496.92] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [46, 9208.13] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [49, 12101.9] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [54, 14474.6] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [54, 15549.2] ++ - - [384, 1024, 1, 64, 416, 416, 96, 1024] ++ - [12, 3905.92] ++ - - [384, 1024, 1, 128, 416, 416, 160, 1024] ++ - [32, 6686.81] ++ - - [384, 1024, 1, 256, 416, 416, 288, 1024] ++ - [17, 11179.9] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [16, 16662.0] ++ - - [384, 1024, 1, 
1024, 416, 416, 1056, 1056] ++ - [18, 22290.4] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [17, 26651.7] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [18, 28790.8] ++ - - [384, 2048, 1, 64, 416, 416, 96, 2048] ++ - [34, 6469.36] ++ - - [384, 2048, 1, 128, 416, 416, 160, 2048] ++ - [35, 12217.9] ++ - - [384, 2048, 1, 256, 416, 416, 288, 2048] ++ - [37, 18670.8] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [20, 27522.4] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [22, 34049.6] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [26, 36463.1] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [39, 36406.3] ++ - - [384, 4096, 1, 64, 416, 416, 96, 4096] ++ - [16, 12657.3] ++ - - [384, 4096, 1, 128, 416, 416, 160, 4096] ++ - [3, 19315.6] ++ - - [384, 4096, 1, 256, 416, 416, 288, 4096] ++ - [14, 26391.4] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] ++ - [4, 31297.2] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [14, 33942.0] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [13, 36838.9] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [5, 37450.5] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [16, 485.752] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [9, 820.161] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [9, 1376.23] ++ - - [768, 64, 1, 512, 800, 800, 544, 544] ++ - [46, 2110.52] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [54, 2862.03] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [49, 3506.33] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [49, 3781.92] ++ - - [768, 128, 1, 64, 800, 800, 96, 128] ++ - [18, 1078.23] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [18, 1967.0] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [7, 3195.66] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [58, 4725.98] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [49, 6249.28] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [54, 7295.51] ++ - - [768, 128, 1, 4096, 
800, 800, 4128, 4128] ++ - [49, 7735.16] ++ - - [768, 256, 1, 64, 800, 800, 96, 256] ++ - [18, 2141.41] ++ - - [768, 256, 1, 128, 800, 800, 160, 256] ++ - [9, 3815.89] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [18, 6442.04] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [18, 9399.0] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [9, 12125.2] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [54, 14413.4] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [54, 15420.0] ++ - - [768, 512, 1, 64, 800, 800, 96, 512] ++ - [37, 3905.31] ++ - - [768, 512, 1, 128, 800, 800, 160, 512] ++ - [34, 6931.78] ++ - - [768, 512, 1, 256, 800, 800, 288, 512] ++ - [18, 11536.0] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [17, 16956.7] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [6, 21543.8] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [18, 26583.1] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [18, 28557.0] ++ - - [768, 1024, 1, 64, 800, 800, 96, 1024] ++ - [34, 6702.84] ++ - - [768, 1024, 1, 128, 800, 800, 160, 1024] ++ - [34, 11594.5] ++ - - [768, 1024, 1, 256, 800, 800, 288, 1024] ++ - [23, 19451.8] ++ - - [768, 1024, 1, 512, 800, 800, 544, 1024] ++ - [30, 26711.8] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [38, 32086.5] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [26, 35618.1] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [30, 39787.9] ++ - - [768, 2048, 1, 64, 800, 800, 96, 2048] ++ - [14, 12965.4] ++ - - [768, 2048, 1, 128, 800, 800, 160, 2048] ++ - [44, 20042.6] ++ - - [768, 2048, 1, 256, 800, 800, 288, 2048] ++ - [32, 27501.8] ++ - - [768, 2048, 1, 512, 800, 800, 544, 2048] ++ - [28, 33332.3] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [44, 36359.4] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [21, 40073.0] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [44, 41485.0] ++ - - [768, 4096, 1, 64, 800, 800, 96, 4096] ++ - [34, 17333.4] ++ - - [768, 4096, 1, 128, 
800, 800, 160, 4096] ++ - [34, 25453.9] ++ - - [768, 4096, 1, 256, 800, 800, 288, 4096] ++ - [21, 32512.7] ++ - - [768, 4096, 1, 512, 800, 800, 544, 4096] ++ - [34, 35809.7] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [44, 40149.9] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [41, 42542.7] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [34, 42891.8] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [0, 975.272] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [7, 1632.24] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [54, 2743.77] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [54, 4194.31] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [59, 5595.21] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [54, 7021.23] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [48, 7699.95] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 128] ++ - [12, 2143.61] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [9, 3924.82] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [18, 6387.27] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [9, 9313.78] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [54, 11926.9] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [46, 14667.0] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [46, 15682.7] ++ - - [1536, 256, 1, 64, 1568, 1568, 96, 256] ++ - [18, 3721.1] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 256] ++ - [6, 6660.29] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [16, 11092.4] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [15, 16444.3] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [44, 21416.6] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [44, 25685.2] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [28, 27191.2] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 512] ++ - [44, 6457.74] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 512] ++ - [58, 12238.7] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 512] 
++ - [26, 19137.5] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [43, 27369.1] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [38, 33599.3] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [30, 36802.3] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [29, 40257.3] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] ++ - [13, 12478.4] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] ++ - [41, 19616.7] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] ++ - [28, 27098.3] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] ++ - [44, 34226.2] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [21, 36458.2] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [41, 40071.5] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [28, 41980.3] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] ++ - [44, 17843.4] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] ++ - [44, 25563.7] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] ++ - [41, 32223.9] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] ++ - [41, 36222.1] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] ++ - [41, 40045.6] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [41, 42563.0] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [44, 43059.2] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] ++ - [41, 21679.5] ++ - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] ++ - [34, 29599.3] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] ++ - [34, 33815.1] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] ++ - [44, 38645.2] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] ++ - [41, 41823.0] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [44, 42770.2] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [36, 43247.3] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [15, 1879.73] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [31, 3021.83] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [15, 5104.11] ++ - - [3072, 64, 
1, 512, 3104, 3104, 544, 544] ++ - [8, 7937.49] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [15, 10948.2] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [7, 12625.9] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [9, 12220.3] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 128] ++ - [44, 3658.89] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [16, 6709.1] ++ - - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [15, 10666.9] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [16, 16065.0] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [6, 21593.5] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [44, 25465.9] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [27, 22539.3] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 256] ++ - [24, 6864.65] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 256] ++ - [60, 11658.9] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [41, 18509.4] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [38, 26204.2] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [26, 34467.8] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [42, 37075.9] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [42, 35658.3] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 512] ++ - [4, 12361.9] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 512] ++ - [5, 19468.8] ++ - - [3072, 512, 1, 256, 3104, 3104, 288, 512] ++ - [28, 27140.3] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [37, 33603.5] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [44, 36666.5] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [44, 39971.6] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [32, 40473.6] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] ++ - [34, 17348.3] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] ++ - [34, 25067.2] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] ++ - [44, 32251.0] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] ++ - [37, 35884.7] ++ - - [3072, 1024, 
1, 1024, 3104, 3104, 1056, 1056] ++ - [44, 40045.1] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [44, 42618.2] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [44, 42768.5] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] ++ - [44, 21665.5] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] ++ - [37, 29264.8] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] ++ - [34, 34128.2] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] ++ - [41, 38685.1] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] ++ - [44, 41801.3] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [44, 42907.2] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [41, 42739.0] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] ++ - [34, 24917.5] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] ++ - [41, 30784.5] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] ++ - [41, 36354.5] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] ++ - [41, 40252.5] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] ++ - [37, 41771.8] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] ++ - [40, 42839.0] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [44, 41231.2] ++ - - [4096, 64, 1, 64, 4128, 4128, 96, 96] ++ - [1, 2395.03] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [24, 4100.52] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [33, 6805.48] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [33, 10446.6] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [27, 13849.0] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [40, 16299.5] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [57, 14729.2] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 128] ++ - [37, 5800.27] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [25, 10186.6] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [34, 16292.6] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [37, 22693.0] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [37, 29239.8] 
++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [25, 34083.8] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [59, 28949.3] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 256] ++ - [37, 9727.33] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 256] ++ - [41, 16118.4] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [34, 23625.8] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [37, 30986.4] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [28, 37067.8] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [32, 38750.7] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [28, 37900.3] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 512] ++ - [41, 14441.3] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 512] ++ - [44, 21972.3] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 512] ++ - [34, 29578.1] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [28, 36156.6] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [41, 38337.7] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [41, 41676.9] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [28, 41748.0] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] ++ - [34, 19134.4] ++ - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] ++ - [41, 27035.5] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] ++ - [37, 33723.1] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] ++ - [41, 37471.4] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [41, 40968.1] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [28, 43023.1] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [34, 42660.1] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] ++ - [34, 23126.1] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] ++ - [44, 30466.0] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] ++ - [41, 35304.8] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] ++ - [44, 39559.8] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] ++ - [41, 42293.5] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 
2080] ++ - [44, 42711.1] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [44, 43141.1] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] ++ - [44, 23646.6] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] ++ - [37, 30074.7] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] ++ - [13, 34815.7] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] ++ - [13, 39747.8] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] ++ - [44, 41444.8] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] ++ - [37, 42986.9] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [60, 40108.5] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_I8II_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_I8II_BH.yaml +new file mode 100644 +index 00000000..1e9dd2f6 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_I8II_BH.yaml +@@ -0,0 +1,26493 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false 
++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ 
LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ 
PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ 
ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: 
Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} 
++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true 
++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false 
++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ 
LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ 
VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ 
StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 
++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: 
true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ 
MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ 
NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ 
AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 
++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: 
false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 
++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ 
GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ 
PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: 
false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ 
Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ 
MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ 
- 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ 
LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true 
++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ 
StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: 
ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ 
MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true 
++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 
1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ 
LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ 
IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ 
UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ 
GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ 
NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ 
StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ 
DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ 
MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ 
TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false 
++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 
3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 
++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ 
SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: 
false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true 
++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ 
IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ 
_DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 
2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ 
ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: 
Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} 
++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: 
false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ 
LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 
++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ 
StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ 
BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ 
MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ 
NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ 
AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 
++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true 
++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ 
UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ 
GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ 
PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ 
StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 
true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 
++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false 
++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ 
LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: 
++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false 
++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: 
true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ 
StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: 
ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ 
MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true 
++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 
1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ 
LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ 
IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 
++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ 
UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ 
GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ 
NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ 
StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 
0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 
++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ 
TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 
++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 
1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 
++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: 
false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8448 ++ LdsNumElementsAlignedA: 4352 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4352 ++ LdsOffsetB_Blk: 20736 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true 
++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ 
IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: 
B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 
++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ 
PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 
++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: 
true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: 
Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} 
++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true 
++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false 
++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 
32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ 
VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ 
StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ 
BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ 
MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ 
NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ 
AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ 
LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ 
ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ 
UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 
4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ 
PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: 
false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ 
Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ 
MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: 
false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 
++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ 
ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ 
NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ 
StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ 
DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 
++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ 
TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false 
++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 
3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ 
SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: 
false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: 
true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 
++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B 
++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ 
LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ 
PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ 
ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: 
Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 79 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} 
++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true 
++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 80 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 81 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 82 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false 
++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 83 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 
++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 84 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 
++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 85 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ 
StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 86 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: 
false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 
1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ 
MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 87 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ 
AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ 
LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ 
AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 88 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false 
++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 89 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ 
StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: 
ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 
16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ 
TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 90 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ 
LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ 
IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 91 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 
128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 92 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ 
ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 93 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: 
false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 94 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [2, 36.4343] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [7, 61.0845] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [84, 105.068] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [39, 168.473] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [39, 233.458] ++ - 
- [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [56, 295.885] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [84, 327.309] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [16, 64.251] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [16, 117.95] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [39, 204.6] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [39, 335.303] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [38, 472.811] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [49, 595.254] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [65, 673.054] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [23, 152.078] ++ - - [64, 256, 1, 128, 96, 96, 160, 256] ++ - [39, 278.581] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [38, 457.943] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [66, 713.865] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [48, 993.322] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [85, 1234.03] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [36, 1328.6] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [18, 332.409] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [38, 601.852] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [83, 1013.61] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [36, 1501.45] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [65, 2095.58] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [36, 2525.27] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [36, 2673.82] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [9, 677.377] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [38, 1240.92] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [36, 2085.68] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [83, 3155.39] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [73, 4164.9] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [83, 5128.3] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [73, 5361.96] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [73, 1288.37] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ 
- [40, 2315.38] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [73, 3955.96] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [36, 6077.05] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [73, 8239.27] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [36, 10096.1] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [65, 11119.3] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [27, 2196.84] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [25, 3963.43] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [9, 6707.53] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [20, 9958.28] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [31, 13923.7] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [15, 16947.8] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [6, 18577.2] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [45, 65.8986] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [6, 122.084] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [6, 203.924] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [87, 323.71] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [41, 466.786] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [40, 585.368] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [58, 666.477] ++ - - [128, 128, 1, 64, 160, 160, 96, 128] ++ - [28, 181.857] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [8, 315.456] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [39, 517.304] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [77, 802.816] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [59, 1046.42] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [76, 1282.07] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [51, 1338.97] ++ - - [128, 256, 1, 64, 160, 160, 96, 256] ++ - [39, 392.138] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [39, 691.901] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [85, 1136.05] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [69, 1683.28] ++ - - [128, 256, 1, 1024, 160, 160, 
1056, 1056] ++ - [88, 2160.48] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [69, 2618.27] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [51, 2759.92] ++ - - [128, 512, 1, 64, 160, 160, 96, 512] ++ - [39, 799.376] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [38, 1419.4] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [58, 2233.09] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [51, 3434.44] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [51, 4356.59] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [59, 5244.93] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [58, 5382.06] ++ - - [128, 1024, 1, 64, 160, 160, 96, 1024] ++ - [39, 1395.54] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [74, 2626.36] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [58, 4353.2] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [41, 6514.17] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [88, 8419.13] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [58, 10179.2] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [59, 11117.0] ++ - - [128, 2048, 1, 64, 160, 160, 96, 2048] ++ - [39, 2496.98] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [57, 4718.67] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [39, 7982.5] ++ - - [128, 2048, 1, 512, 160, 160, 544, 2048] ++ - [41, 11752.9] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [88, 16233.4] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [36, 19544.6] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [59, 21654.6] ++ - - [128, 4096, 1, 64, 160, 160, 96, 4096] ++ - [38, 5001.44] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [27, 8921.68] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [79, 15829.4] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [24, 23407.3] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [21, 31119.4] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [79, 35971.3] ++ - - [128, 4096, 1, 
4096, 160, 160, 4128, 4128] ++ - [5, 37787.9] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [1, 155.115] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [37, 276.305] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [36, 470.69] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [87, 736.618] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [76, 1003.78] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [58, 1221.32] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [41, 1317.31] ++ - - [256, 128, 1, 64, 288, 288, 96, 128] ++ - [38, 388.006] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [27, 685.68] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [76, 1130.84] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [51, 1695.7] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [88, 2208.55] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [58, 2580.72] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [87, 2733.01] ++ - - [256, 256, 1, 64, 288, 288, 96, 256] ++ - [58, 791.83] ++ - - [256, 256, 1, 128, 288, 288, 160, 256] ++ - [6, 1378.57] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [41, 2204.05] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [50, 3314.35] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [50, 4472.43] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [69, 5192.58] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [68, 5359.49] ++ - - [256, 512, 1, 64, 288, 288, 96, 512] ++ - [17, 1435.18] ++ - - [256, 512, 1, 128, 288, 288, 160, 512] ++ - [67, 2498.84] ++ - - [256, 512, 1, 256, 288, 288, 288, 512] ++ - [77, 4343.62] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [50, 6528.73] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [87, 8487.27] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [69, 10229.6] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [40, 11111.4] ++ - - [256, 1024, 1, 64, 288, 288, 96, 1024] ++ - [57, 2614.09] ++ - - [256, 1024, 1, 128, 288, 288, 160, 1024] ++ - 
[44, 4717.36] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [58, 7728.76] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [50, 12248.4] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [69, 16189.3] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [58, 19777.2] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [40, 21984.4] ++ - - [256, 2048, 1, 64, 288, 288, 96, 2048] ++ - [60, 5004.41] ++ - - [256, 2048, 1, 128, 288, 288, 160, 2048] ++ - [15, 8882.71] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [6, 14686.3] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [21, 23356.5] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] ++ - [78, 30780.4] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [6, 35913.5] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [31, 37662.0] ++ - - [256, 4096, 1, 64, 288, 288, 96, 4096] ++ - [50, 9097.04] ++ - - [256, 4096, 1, 128, 288, 288, 160, 4096] ++ - [85, 15353.2] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [63, 22876.8] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [81, 30093.7] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [55, 35581.5] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [63, 37636.2] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [93, 40145.2] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [0, 237.557] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [67, 427.525] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [68, 727.59] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [68, 1096.26] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [87, 1486.9] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [76, 1850.63] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [59, 1979.19] ++ - - [384, 128, 1, 64, 416, 416, 96, 128] ++ - [41, 594.993] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [39, 1042.32] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [6, 1692.39] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [51, 
2483.31] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [88, 3348.08] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [76, 3905.62] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [40, 4033.15] ++ - - [384, 256, 1, 64, 416, 416, 96, 256] ++ - [76, 1156.52] ++ - - [384, 256, 1, 128, 416, 416, 160, 256] ++ - [39, 2067.52] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [41, 3255.18] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [51, 4899.89] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [50, 6593.95] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [88, 7707.46] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [88, 8029.62] ++ - - [384, 512, 1, 64, 416, 416, 96, 512] ++ - [86, 2072.28] ++ - - [384, 512, 1, 128, 416, 416, 160, 512] ++ - [85, 3562.05] ++ - - [384, 512, 1, 256, 416, 416, 288, 512] ++ - [51, 6016.21] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [88, 9195.51] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [51, 12683.6] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [87, 15155.6] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [88, 16740.3] ++ - - [384, 1024, 1, 64, 416, 416, 96, 1024] ++ - [30, 3623.07] ++ - - [384, 1024, 1, 128, 416, 416, 160, 1024] ++ - [28, 6477.69] ++ - - [384, 1024, 1, 256, 416, 416, 288, 1024] ++ - [90, 11132.9] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [32, 17228.0] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [26, 22142.1] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [34, 26662.3] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [90, 28382.4] ++ - - [384, 2048, 1, 64, 416, 416, 96, 2048] ++ - [77, 8056.93] ++ - - [384, 2048, 1, 128, 416, 416, 160, 2048] ++ - [75, 13508.3] ++ - - [384, 2048, 1, 256, 416, 416, 288, 2048] ++ - [81, 19806.0] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [81, 28038.0] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [81, 33376.4] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - 
[93, 36319.3] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [93, 38891.5] ++ - - [384, 4096, 1, 64, 416, 416, 96, 4096] ++ - [74, 11956.7] ++ - - [384, 4096, 1, 128, 416, 416, 160, 4096] ++ - [73, 19519.7] ++ - - [384, 4096, 1, 256, 416, 416, 288, 4096] ++ - [85, 26332.7] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] ++ - [13, 31788.8] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [74, 34219.6] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [73, 37183.3] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [41, 38906.3] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [3, 518.842] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [36, 858.785] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [36, 1516.56] ++ - - [768, 64, 1, 512, 800, 800, 544, 544] ++ - [76, 2285.93] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [59, 3034.95] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [40, 3732.01] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [58, 3974.32] ++ - - [768, 128, 1, 64, 800, 800, 96, 128] ++ - [76, 1141.62] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [38, 2028.19] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [38, 3342.07] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [50, 4851.24] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [77, 6556.16] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [59, 7680.13] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [41, 7994.71] ++ - - [768, 256, 1, 64, 800, 800, 96, 256] ++ - [85, 2034.42] ++ - - [768, 256, 1, 128, 800, 800, 160, 256] ++ - [40, 3648.81] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [50, 5934.64] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [50, 9104.88] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [50, 12528.1] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [76, 14913.6] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [76, 16691.3] ++ - - [768, 512, 1, 64, 800, 800, 96, 512] ++ - [19, 3772.42] ++ 
- - [768, 512, 1, 128, 800, 800, 160, 512] ++ - [85, 6467.7] ++ - - [768, 512, 1, 256, 800, 800, 288, 512] ++ - [20, 10793.8] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [70, 17176.6] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [15, 22504.6] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [31, 27179.7] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [31, 28915.4] ++ - - [768, 1024, 1, 64, 800, 800, 96, 1024] ++ - [84, 8033.78] ++ - - [768, 1024, 1, 128, 800, 800, 160, 1024] ++ - [29, 13200.0] ++ - - [768, 1024, 1, 256, 800, 800, 288, 1024] ++ - [81, 19763.1] ++ - - [768, 1024, 1, 512, 800, 800, 544, 1024] ++ - [82, 27977.6] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [93, 33266.1] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [93, 35939.2] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [93, 39032.4] ++ - - [768, 2048, 1, 64, 800, 800, 96, 2048] ++ - [73, 12340.7] ++ - - [768, 2048, 1, 128, 800, 800, 160, 2048] ++ - [13, 19012.8] ++ - - [768, 2048, 1, 256, 800, 800, 288, 2048] ++ - [94, 26329.3] ++ - - [768, 2048, 1, 512, 800, 800, 544, 2048] ++ - [81, 32810.8] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [93, 35496.4] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [46, 38748.8] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [94, 40602.6] ++ - - [768, 4096, 1, 64, 800, 800, 96, 4096] ++ - [92, 16915.4] ++ - - [768, 4096, 1, 128, 800, 800, 160, 4096] ++ - [83, 24720.8] ++ - - [768, 4096, 1, 256, 800, 800, 288, 4096] ++ - [63, 31396.0] ++ - - [768, 4096, 1, 512, 800, 800, 544, 4096] ++ - [46, 35174.7] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [63, 39080.7] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [81, 41196.1] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [93, 41732.1] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [0, 961.555] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [85, 1732.47] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - 
[87, 2935.82] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [68, 4522.97] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [58, 6027.74] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [58, 7438.1] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [50, 7991.38] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 128] ++ - [41, 1995.39] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [38, 3640.9] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [87, 6077.99] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [50, 9274.33] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [58, 12202.4] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [41, 14979.7] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [41, 16861.9] ++ - - [1536, 256, 1, 64, 1568, 1568, 96, 256] ++ - [36, 3744.36] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 256] ++ - [60, 6428.89] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [89, 11314.4] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [32, 16829.1] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [42, 22465.7] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [4, 26640.2] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [52, 28402.9] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 512] ++ - [87, 7699.5] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 512] ++ - [75, 12940.4] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 512] ++ - [72, 20375.1] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [93, 27291.2] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [63, 33830.8] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [63, 36211.5] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [93, 38960.7] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] ++ - [83, 11834.4] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] ++ - [57, 19533.0] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] ++ - [22, 25987.7] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] ++ - 
[81, 33271.7] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [63, 35717.6] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [63, 38785.7] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [94, 40624.1] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] ++ - [92, 17182.4] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] ++ - [75, 25107.8] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] ++ - [63, 31219.5] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] ++ - [81, 35398.9] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] ++ - [81, 39111.6] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [81, 41169.6] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [93, 41751.3] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] ++ - [91, 22034.3] ++ - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] ++ - [85, 29172.5] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] ++ - [93, 33740.7] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] ++ - [93, 38246.9] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] ++ - [63, 40816.9] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [94, 41277.3] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [46, 41750.4] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [35, 1578.0] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [4, 2931.37] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [4, 4832.16] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [27, 7409.34] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [11, 10308.1] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [78, 12814.4] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [42, 13927.1] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 128] ++ - [36, 3979.43] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [43, 6292.26] ++ - - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [31, 11131.6] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [6, 16619.3] ++ - - [3072, 128, 1, 1024, 3104, 3104, 
1056, 1056] ++ - [60, 22457.0] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [4, 26709.1] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [4, 28908.6] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 256] ++ - [58, 7641.06] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 256] ++ - [61, 13427.1] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [63, 20288.9] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [64, 27274.5] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [63, 33859.2] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [47, 36139.2] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [72, 38961.1] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 512] ++ - [65, 11867.9] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 512] ++ - [83, 19514.1] ++ - - [3072, 512, 1, 256, 3104, 3104, 288, 512] ++ - [63, 25869.2] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [63, 33237.3] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [63, 35725.5] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [64, 38951.2] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [72, 40659.2] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] ++ - [80, 17518.9] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] ++ - [57, 24552.1] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] ++ - [81, 31895.9] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] ++ - [81, 35489.4] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [81, 39129.6] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [81, 41200.3] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [93, 41732.1] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] ++ - [80, 21727.5] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] ++ - [75, 29237.1] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] ++ - [93, 33840.7] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] ++ - [81, 38302.4] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] ++ - [63, 40853.4] ++ - - [3072, 
2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [81, 41506.4] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [63, 41389.9] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] ++ - [45, 6763.2] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] ++ - [53, 12835.5] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] ++ - [45, 23148.5] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] ++ - [92, 35655.0] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] ++ - [93, 39466.7] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] ++ - [64, 41295.0] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [81, 41849.2] ++ - - [4096, 64, 1, 64, 4128, 4128, 96, 96] ++ - [78, 1791.1] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [10, 3576.09] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [14, 5959.42] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [14, 9558.99] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [33, 13081.7] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [12, 16531.3] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [11, 18058.3] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 128] ++ - [68, 5773.3] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [42, 10035.7] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [12, 16141.6] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [24, 23615.3] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [3, 30533.6] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [42, 35860.8] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [0, 37730.8] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 256] ++ - [68, 9804.11] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 256] ++ - [62, 15429.1] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [93, 22639.5] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [63, 30681.9] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [81, 35673.7] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [63, 37833.2] ++ - - 
[4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [46, 40145.6] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 512] ++ - [71, 13770.2] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 512] ++ - [75, 21377.4] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 512] ++ - [63, 28915.4] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [81, 35019.8] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [63, 37461.6] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [81, 40218.5] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [93, 41715.8] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] ++ - [80, 18825.8] ++ - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] ++ - [75, 27206.8] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] ++ - [81, 33398.9] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] ++ - [63, 36738.7] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [81, 39857.6] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [81, 41701.6] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [63, 41326.1] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] ++ - [91, 23035.8] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] ++ - [85, 30768.9] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] ++ - [35, 34131.5] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] ++ - [88, 37340.4] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] ++ - [94, 39941.3] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [93, 40407.9] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [81, 41869.1] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] ++ - [45, 7040.61] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] ++ - [53, 13693.6] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] ++ - [71, 24216.1] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] ++ - [54, 36638.2] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] ++ - [64, 38693.3] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] ++ - [81, 41280.4] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 
4128, 4128] ++ - [93, 41825.4] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_I8II_BH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_I8II_BH_GB.yaml +new file mode 100644 +index 00000000..3c182a53 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_I8II_BH_GB.yaml +@@ -0,0 +1,26493 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: 
{0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: 
false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: 
[1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ 
SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true 
++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ 
IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ 
_DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 
++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ 
ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: 
Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} 
++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true 
++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: 
false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] 
++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 
++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 
++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ 
StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 
++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true 
++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ 
MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ 
NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ 
AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ 
LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false 
++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: 
true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: 
false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false 
++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 
16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ 
MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ 
AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 
2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: 
true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ 
UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ 
GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ 
PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ 
StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 
true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 
++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false 
++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ 
LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ 
AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ 
StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: 
ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 
++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: 
true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 
++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ 
IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: 
SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 
1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ 
StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ 
DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12544 ++ LdsNumElementsAlignedA: 2304 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2304 ++ LdsOffsetB_Blk: 10496 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ 
MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: 
false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ 
AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ 
_VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 2 ++ LVCA: 2 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 
++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ 
ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ 
SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 4608 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 16 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 16 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 1 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 12800 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 2048 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 8192 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 10752 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: 
true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 
++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ 
WorkGroupMappingType: B ++ _DepthULds: 16 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 4 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ 
LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: 
Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: 
{} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: 
true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false 
++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false 
++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ 
LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ 
VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ 
StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ 
BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ 
MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ 
NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ 
AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 
++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true 
++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ 
UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 
2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ 
PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: 
false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ 
Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ 
MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: 
false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 
++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ 
ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ 
NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ 
StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ 
DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 8448 ++ LdsNumElementsAlignedA: 4352 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4352 ++ LdsOffsetB_Blk: 20736 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ 
MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: 
false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ 
AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ 
_VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ 
ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ 
SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: 
true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 
++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: 
B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 
++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ 
PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ 
ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: 
Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: 
{} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: 
true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false 
++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ 
SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false 
++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 
++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 
++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ 
StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: 
false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 
1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ 
MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ 
AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ 
LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ 
AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: 
false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false 
++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ 
EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ 
MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true 
++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ 
LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ 
IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 
++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 
++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false 
++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} 
++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: 
false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: 
[1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ 
ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 79 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 ++ 
SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ 
DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ 
MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: 
false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 80 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ 
AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: 
true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 
++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 81 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: 
B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 
++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ 
PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 82 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ 
ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 83 ++ SolutionNameMin: 
Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 2 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 84 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ 
AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true 
++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ 
Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 85 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 
++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 86 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true 
++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 87 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ 
CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 
++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ 
NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 88 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ 
LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 89 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ 
VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ 
GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 9216 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 90 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false 
++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 91 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true 
++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: 
[16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: 
[] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 92 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ 
AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ 
LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ 
AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 93 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: 
false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 2 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4096 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 94 ++ SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false 
++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: true ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: false ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [2, 36.4343] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [7, 61.0845] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [84, 105.068] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [39, 168.473] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [39, 233.458] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [56, 295.885] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [84, 327.309] ++ - - [64, 128, 1, 64, 96, 96, 96, 128] ++ - [16, 64.251] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [16, 117.95] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [39, 204.6] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [39, 335.303] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [38, 472.811] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [49, 595.254] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [65, 673.054] ++ - - [64, 256, 1, 64, 96, 96, 96, 256] ++ - [23, 152.078] ++ - - [64, 256, 1, 128, 96, 96, 
160, 256] ++ - [39, 278.581] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [38, 457.943] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [66, 713.865] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [48, 993.322] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [85, 1234.03] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [36, 1328.6] ++ - - [64, 512, 1, 64, 96, 96, 96, 512] ++ - [18, 332.409] ++ - - [64, 512, 1, 128, 96, 96, 160, 512] ++ - [38, 601.852] ++ - - [64, 512, 1, 256, 96, 96, 288, 512] ++ - [83, 1013.61] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [36, 1501.45] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [65, 2095.58] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [36, 2525.27] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [36, 2673.82] ++ - - [64, 1024, 1, 64, 96, 96, 96, 1024] ++ - [9, 677.377] ++ - - [64, 1024, 1, 128, 96, 96, 160, 1024] ++ - [38, 1240.92] ++ - - [64, 1024, 1, 256, 96, 96, 288, 1024] ++ - [36, 2085.68] ++ - - [64, 1024, 1, 512, 96, 96, 544, 1024] ++ - [83, 3155.39] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [73, 4164.9] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [83, 5128.3] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [73, 5361.96] ++ - - [64, 2048, 1, 64, 96, 96, 96, 2048] ++ - [73, 1288.37] ++ - - [64, 2048, 1, 128, 96, 96, 160, 2048] ++ - [40, 2315.38] ++ - - [64, 2048, 1, 256, 96, 96, 288, 2048] ++ - [73, 3955.96] ++ - - [64, 2048, 1, 512, 96, 96, 544, 2048] ++ - [36, 6077.05] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] ++ - [73, 8239.27] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [36, 10096.1] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [65, 11119.3] ++ - - [64, 4096, 1, 64, 96, 96, 96, 4096] ++ - [27, 2196.84] ++ - - [64, 4096, 1, 128, 96, 96, 160, 4096] ++ - [25, 3963.43] ++ - - [64, 4096, 1, 256, 96, 96, 288, 4096] ++ - [9, 6707.53] ++ - - [64, 4096, 1, 512, 96, 96, 544, 4096] ++ - [20, 9958.28] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] ++ - [31, 
13923.7] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] ++ - [15, 16947.8] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [6, 18577.2] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [45, 65.8986] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [6, 122.084] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [6, 203.924] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [87, 323.71] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [41, 466.786] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [40, 585.368] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [58, 666.477] ++ - - [128, 128, 1, 64, 160, 160, 96, 128] ++ - [28, 181.857] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [8, 315.456] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [39, 517.304] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [77, 802.816] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [59, 1046.42] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [76, 1282.07] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [51, 1338.97] ++ - - [128, 256, 1, 64, 160, 160, 96, 256] ++ - [39, 392.138] ++ - - [128, 256, 1, 128, 160, 160, 160, 256] ++ - [39, 691.901] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [85, 1136.05] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [69, 1683.28] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [88, 2160.48] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [69, 2618.27] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [51, 2759.92] ++ - - [128, 512, 1, 64, 160, 160, 96, 512] ++ - [39, 799.376] ++ - - [128, 512, 1, 128, 160, 160, 160, 512] ++ - [38, 1419.4] ++ - - [128, 512, 1, 256, 160, 160, 288, 512] ++ - [58, 2233.09] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [51, 3434.44] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [51, 4356.59] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [59, 5244.93] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [58, 5382.06] ++ - - [128, 1024, 1, 
64, 160, 160, 96, 1024] ++ - [39, 1395.54] ++ - - [128, 1024, 1, 128, 160, 160, 160, 1024] ++ - [74, 2626.36] ++ - - [128, 1024, 1, 256, 160, 160, 288, 1024] ++ - [58, 4353.2] ++ - - [128, 1024, 1, 512, 160, 160, 544, 1024] ++ - [41, 6514.17] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [88, 8419.13] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [58, 10179.2] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [59, 11117.0] ++ - - [128, 2048, 1, 64, 160, 160, 96, 2048] ++ - [39, 2496.98] ++ - - [128, 2048, 1, 128, 160, 160, 160, 2048] ++ - [57, 4718.67] ++ - - [128, 2048, 1, 256, 160, 160, 288, 2048] ++ - [39, 7982.5] ++ - - [128, 2048, 1, 512, 160, 160, 544, 2048] ++ - [41, 11752.9] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] ++ - [88, 16233.4] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [36, 19544.6] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [59, 21654.6] ++ - - [128, 4096, 1, 64, 160, 160, 96, 4096] ++ - [38, 5001.44] ++ - - [128, 4096, 1, 128, 160, 160, 160, 4096] ++ - [27, 8921.68] ++ - - [128, 4096, 1, 256, 160, 160, 288, 4096] ++ - [79, 15829.4] ++ - - [128, 4096, 1, 512, 160, 160, 544, 4096] ++ - [24, 23407.3] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] ++ - [21, 31119.4] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] ++ - [79, 35971.3] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [5, 37787.9] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [1, 155.115] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [37, 276.305] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [36, 470.69] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [87, 736.618] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [76, 1003.78] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [58, 1221.32] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [41, 1317.31] ++ - - [256, 128, 1, 64, 288, 288, 96, 128] ++ - [38, 388.006] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [27, 685.68] ++ - - [256, 128, 1, 256, 
288, 288, 288, 288] ++ - [76, 1130.84] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [51, 1695.7] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [88, 2208.55] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [58, 2580.72] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [87, 2733.01] ++ - - [256, 256, 1, 64, 288, 288, 96, 256] ++ - [58, 791.83] ++ - - [256, 256, 1, 128, 288, 288, 160, 256] ++ - [6, 1378.57] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [41, 2204.05] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [50, 3314.35] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [50, 4472.43] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [69, 5192.58] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [68, 5359.49] ++ - - [256, 512, 1, 64, 288, 288, 96, 512] ++ - [17, 1435.18] ++ - - [256, 512, 1, 128, 288, 288, 160, 512] ++ - [67, 2498.84] ++ - - [256, 512, 1, 256, 288, 288, 288, 512] ++ - [77, 4343.62] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [50, 6528.73] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [87, 8487.27] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [69, 10229.6] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [40, 11111.4] ++ - - [256, 1024, 1, 64, 288, 288, 96, 1024] ++ - [57, 2614.09] ++ - - [256, 1024, 1, 128, 288, 288, 160, 1024] ++ - [44, 4717.36] ++ - - [256, 1024, 1, 256, 288, 288, 288, 1024] ++ - [58, 7728.76] ++ - - [256, 1024, 1, 512, 288, 288, 544, 1024] ++ - [50, 12248.4] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [69, 16189.3] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [58, 19777.2] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [40, 21984.4] ++ - - [256, 2048, 1, 64, 288, 288, 96, 2048] ++ - [60, 5004.41] ++ - - [256, 2048, 1, 128, 288, 288, 160, 2048] ++ - [15, 8882.71] ++ - - [256, 2048, 1, 256, 288, 288, 288, 2048] ++ - [6, 14686.3] ++ - - [256, 2048, 1, 512, 288, 288, 544, 2048] ++ - [21, 23356.5] ++ - - [256, 2048, 1, 1024, 288, 
288, 1056, 2048] ++ - [78, 30780.4] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [6, 35913.5] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [31, 37662.0] ++ - - [256, 4096, 1, 64, 288, 288, 96, 4096] ++ - [50, 9097.04] ++ - - [256, 4096, 1, 128, 288, 288, 160, 4096] ++ - [85, 15353.2] ++ - - [256, 4096, 1, 256, 288, 288, 288, 4096] ++ - [63, 22876.8] ++ - - [256, 4096, 1, 512, 288, 288, 544, 4096] ++ - [81, 30093.7] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] ++ - [55, 35581.5] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] ++ - [63, 37636.2] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [93, 40145.2] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [0, 237.557] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [67, 427.525] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [68, 727.59] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [68, 1096.26] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [87, 1486.9] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [76, 1850.63] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [59, 1979.19] ++ - - [384, 128, 1, 64, 416, 416, 96, 128] ++ - [41, 594.993] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [39, 1042.32] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [6, 1692.39] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [51, 2483.31] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [88, 3348.08] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [76, 3905.62] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [40, 4033.15] ++ - - [384, 256, 1, 64, 416, 416, 96, 256] ++ - [76, 1156.52] ++ - - [384, 256, 1, 128, 416, 416, 160, 256] ++ - [39, 2067.52] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [41, 3255.18] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [51, 4899.89] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [50, 6593.95] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [88, 7707.46] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 
4128] ++ - [88, 8029.62] ++ - - [384, 512, 1, 64, 416, 416, 96, 512] ++ - [86, 2072.28] ++ - - [384, 512, 1, 128, 416, 416, 160, 512] ++ - [85, 3562.05] ++ - - [384, 512, 1, 256, 416, 416, 288, 512] ++ - [51, 6016.21] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [88, 9195.51] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [51, 12683.6] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [87, 15155.6] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [88, 16740.3] ++ - - [384, 1024, 1, 64, 416, 416, 96, 1024] ++ - [30, 3623.07] ++ - - [384, 1024, 1, 128, 416, 416, 160, 1024] ++ - [28, 6477.69] ++ - - [384, 1024, 1, 256, 416, 416, 288, 1024] ++ - [90, 11132.9] ++ - - [384, 1024, 1, 512, 416, 416, 544, 1024] ++ - [32, 17228.0] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [26, 22142.1] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [34, 26662.3] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [90, 28382.4] ++ - - [384, 2048, 1, 64, 416, 416, 96, 2048] ++ - [77, 8056.93] ++ - - [384, 2048, 1, 128, 416, 416, 160, 2048] ++ - [75, 13508.3] ++ - - [384, 2048, 1, 256, 416, 416, 288, 2048] ++ - [81, 19806.0] ++ - - [384, 2048, 1, 512, 416, 416, 544, 2048] ++ - [81, 28038.0] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] ++ - [81, 33376.4] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [93, 36319.3] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [93, 38891.5] ++ - - [384, 4096, 1, 64, 416, 416, 96, 4096] ++ - [74, 11956.7] ++ - - [384, 4096, 1, 128, 416, 416, 160, 4096] ++ - [73, 19519.7] ++ - - [384, 4096, 1, 256, 416, 416, 288, 4096] ++ - [85, 26332.7] ++ - - [384, 4096, 1, 512, 416, 416, 544, 4096] ++ - [13, 31788.8] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] ++ - [74, 34219.6] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] ++ - [73, 37183.3] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [41, 38906.3] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [3, 518.842] ++ - - [768, 64, 1, 128, 800, 
800, 160, 160] ++ - [36, 858.785] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [36, 1516.56] ++ - - [768, 64, 1, 512, 800, 800, 544, 544] ++ - [76, 2285.93] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [59, 3034.95] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [40, 3732.01] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [58, 3974.32] ++ - - [768, 128, 1, 64, 800, 800, 96, 128] ++ - [76, 1141.62] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [38, 2028.19] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [38, 3342.07] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [50, 4851.24] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [77, 6556.16] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [59, 7680.13] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [41, 7994.71] ++ - - [768, 256, 1, 64, 800, 800, 96, 256] ++ - [85, 2034.42] ++ - - [768, 256, 1, 128, 800, 800, 160, 256] ++ - [40, 3648.81] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [50, 5934.64] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [50, 9104.88] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [50, 12528.1] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [76, 14913.6] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [76, 16691.3] ++ - - [768, 512, 1, 64, 800, 800, 96, 512] ++ - [19, 3772.42] ++ - - [768, 512, 1, 128, 800, 800, 160, 512] ++ - [85, 6467.7] ++ - - [768, 512, 1, 256, 800, 800, 288, 512] ++ - [20, 10793.8] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [70, 17176.6] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [15, 22504.6] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [31, 27179.7] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [31, 28915.4] ++ - - [768, 1024, 1, 64, 800, 800, 96, 1024] ++ - [84, 8033.78] ++ - - [768, 1024, 1, 128, 800, 800, 160, 1024] ++ - [29, 13200.0] ++ - - [768, 1024, 1, 256, 800, 800, 288, 1024] ++ - [81, 19763.1] ++ - - [768, 1024, 1, 512, 800, 800, 544, 1024] ++ - 
[82, 27977.6] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [93, 33266.1] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [93, 35939.2] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [93, 39032.4] ++ - - [768, 2048, 1, 64, 800, 800, 96, 2048] ++ - [73, 12340.7] ++ - - [768, 2048, 1, 128, 800, 800, 160, 2048] ++ - [13, 19012.8] ++ - - [768, 2048, 1, 256, 800, 800, 288, 2048] ++ - [94, 26329.3] ++ - - [768, 2048, 1, 512, 800, 800, 544, 2048] ++ - [81, 32810.8] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] ++ - [93, 35496.4] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [46, 38748.8] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [94, 40602.6] ++ - - [768, 4096, 1, 64, 800, 800, 96, 4096] ++ - [92, 16915.4] ++ - - [768, 4096, 1, 128, 800, 800, 160, 4096] ++ - [83, 24720.8] ++ - - [768, 4096, 1, 256, 800, 800, 288, 4096] ++ - [63, 31396.0] ++ - - [768, 4096, 1, 512, 800, 800, 544, 4096] ++ - [46, 35174.7] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] ++ - [63, 39080.7] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] ++ - [81, 41196.1] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [93, 41732.1] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [0, 961.555] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [85, 1732.47] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [87, 2935.82] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [68, 4522.97] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [58, 6027.74] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [58, 7438.1] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [50, 7991.38] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 128] ++ - [41, 1995.39] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [38, 3640.9] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [87, 6077.99] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [50, 9274.33] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [58, 12202.4] ++ - - [1536, 128, 
1, 2048, 1568, 1568, 2080, 2080] ++ - [41, 14979.7] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [41, 16861.9] ++ - - [1536, 256, 1, 64, 1568, 1568, 96, 256] ++ - [36, 3744.36] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 256] ++ - [60, 6428.89] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [89, 11314.4] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [32, 16829.1] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [42, 22465.7] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [4, 26640.2] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [52, 28402.9] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 512] ++ - [87, 7699.5] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 512] ++ - [75, 12940.4] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 512] ++ - [72, 20375.1] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [93, 27291.2] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [63, 33830.8] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [63, 36211.5] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [93, 38960.7] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] ++ - [83, 11834.4] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] ++ - [57, 19533.0] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] ++ - [22, 25987.7] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] ++ - [81, 33271.7] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [63, 35717.6] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [63, 38785.7] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [94, 40624.1] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] ++ - [92, 17182.4] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] ++ - [75, 25107.8] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] ++ - [63, 31219.5] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] ++ - [81, 35398.9] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] ++ - [81, 39111.6] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [81, 
41169.6] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [93, 41751.3] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] ++ - [91, 22034.3] ++ - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] ++ - [85, 29172.5] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] ++ - [93, 33740.7] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] ++ - [93, 38246.9] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] ++ - [63, 40816.9] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] ++ - [94, 41277.3] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [46, 41750.4] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [35, 1578.0] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [4, 2931.37] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [4, 4832.16] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [27, 7409.34] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [11, 10308.1] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [78, 12814.4] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [42, 13927.1] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 128] ++ - [36, 3979.43] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [43, 6292.26] ++ - - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [31, 11131.6] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [6, 16619.3] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [60, 22457.0] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [4, 26709.1] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [4, 28908.6] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 256] ++ - [58, 7641.06] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 256] ++ - [61, 13427.1] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [63, 20288.9] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [64, 27274.5] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [63, 33859.2] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [47, 36139.2] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - 
[72, 38961.1] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 512] ++ - [65, 11867.9] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 512] ++ - [83, 19514.1] ++ - - [3072, 512, 1, 256, 3104, 3104, 288, 512] ++ - [63, 25869.2] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [63, 33237.3] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [63, 35725.5] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [64, 38951.2] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [72, 40659.2] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] ++ - [80, 17518.9] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] ++ - [57, 24552.1] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] ++ - [81, 31895.9] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] ++ - [81, 35489.4] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [81, 39129.6] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [81, 41200.3] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [93, 41732.1] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] ++ - [80, 21727.5] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] ++ - [75, 29237.1] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] ++ - [93, 33840.7] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] ++ - [81, 38302.4] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] ++ - [63, 40853.4] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [81, 41506.4] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [63, 41389.9] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] ++ - [45, 6763.2] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] ++ - [53, 12835.5] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] ++ - [45, 23148.5] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] ++ - [92, 35655.0] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] ++ - [93, 39466.7] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] ++ - [64, 41295.0] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [81, 41849.2] ++ - - [4096, 64, 1, 
64, 4128, 4128, 96, 96] ++ - [78, 1791.1] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [10, 3576.09] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [14, 5959.42] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [14, 9558.99] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [33, 13081.7] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [12, 16531.3] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [11, 18058.3] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 128] ++ - [68, 5773.3] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [42, 10035.7] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [12, 16141.6] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [24, 23615.3] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [3, 30533.6] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [42, 35860.8] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [0, 37730.8] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 256] ++ - [68, 9804.11] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 256] ++ - [62, 15429.1] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [93, 22639.5] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [63, 30681.9] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [81, 35673.7] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [63, 37833.2] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [46, 40145.6] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 512] ++ - [71, 13770.2] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 512] ++ - [75, 21377.4] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 512] ++ - [63, 28915.4] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [81, 35019.8] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [63, 37461.6] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [81, 40218.5] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [93, 41715.8] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] ++ - [80, 18825.8] ++ - - [4096, 1024, 1, 128, 
4128, 4128, 160, 1024] ++ - [75, 27206.8] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] ++ - [81, 33398.9] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] ++ - [63, 36738.7] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [81, 39857.6] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [81, 41701.6] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [63, 41326.1] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] ++ - [91, 23035.8] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] ++ - [85, 30768.9] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] ++ - [35, 34131.5] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] ++ - [88, 37340.4] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] ++ - [94, 39941.3] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [93, 40407.9] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [81, 41869.1] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] ++ - [45, 7040.61] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] ++ - [53, 13693.6] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] ++ - [71, 24216.1] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] ++ - [54, 36638.2] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] ++ - [64, 38693.3] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] ++ - [81, 41280.4] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [93, 41825.4] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_SB.yaml +new file mode 100644 +index 00000000..d5ea89ca +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bjlk_SB.yaml +@@ -0,0 +1,310 @@ ++- {MinimumRequiredVersion: 4.33.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ 
ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 0 ++ DestDataType: 0 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 8 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: false ++ 
ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 1 ++ GlobalLoadVectorWidthB: 1 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 1 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 8 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 8 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 32 ++ LVPB: 8 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 512 ++ LdsOffsetA: 0 ++ LdsOffsetB: 256 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 1 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 8 ++ LoopTail: true ++ LoopUnroll: 8 ++ MACInstruction: FMA ++ MIArchVgpr: false ++ MacroTile0: 32 ++ MacroTile1: 32 ++ MacroTileA: 32 ++ MacroTileB: 32 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstruction: [] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 4 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: false ++ PrefetchLocalRead: true ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 0 ++ DestDataType: 0 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [1, 3, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 1 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: true ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: true ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_ ++ SourceSwap: false ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ 
StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [2, 2] ++ ThreadTile0: 2 ++ ThreadTile1: 2 ++ ThreadTileA: 2 ++ ThreadTileB: 2 ++ TransposeLDS: 0 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: 0 ++ UnrollMajorLDSB: 0 ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 8 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [126, 126, 2, 66, 126, 126, 66, 126] ++ - [0, 0] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_BBS_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_BBS_BH.yaml +new file mode 100644 +index 00000000..dfc204bb +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_BBS_BH.yaml +@@ -0,0 +1,9213 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ 
MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 
++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 
2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false 
++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 
2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false 
++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 
2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false 
++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 
2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [3, 32.605] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [23, 61.5651] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [11, 107.249] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [12, 161.942] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [23, 232.397] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [2, 296.25] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [11, 333.861] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [12, 60.4855] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [23, 113.765] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [11, 199.482] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [11, 314.321] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [22, 456.548] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [11, 587.335] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [26, 658.123] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [10, 132.765] ++ - - [64, 256, 1, 128, 96, 96, 160, 160] ++ - [23, 247.188] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [3, 429.524] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [20, 685.848] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [9, 969.333] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [2, 1229.55] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [11, 1331.68] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [12, 297.131] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [20, 547.202] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [20, 939.897] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [29, 1478.43] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [20, 2050.63] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [7, 2550.6] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [9, 2824.33] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [9, 609.637] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [20, 1124.93] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [5, 1866.0] ++ - - [64, 1024, 1, 
512, 96, 96, 544, 544] ++ - [20, 2949.32] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [5, 4099.75] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [5, 5132.02] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [2, 5855.53] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [10, 1154.98] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [20, 2129.36] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [12, 3665.55] ++ - - [64, 2048, 1, 512, 96, 96, 544, 544] ++ - [5, 5800.25] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [9, 8203.52] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [5, 10521.5] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [20, 12185.0] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [8, 2075.36] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [4, 3737.82] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [8, 6204.58] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [22, 9318.73] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [7, 12865.3] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [2, 15730.2] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [8, 16167.9] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [23, 69.1126] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [23, 131.039] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [10, 224.775] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [12, 340.143] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [22, 462.105] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [7, 590.934] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [7, 657.35] ++ - - [128, 128, 1, 64, 160, 160, 96, 96] ++ - [29, 144.771] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [12, 292.817] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [7, 494.845] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [7, 761.562] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [2, 1041.42] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [22, 1277.92] ++ - - [128, 128, 1, 4096, 160, 160, 
4128, 4128] ++ - [11, 1372.37] ++ - - [128, 256, 1, 64, 160, 160, 96, 96] ++ - [12, 335.221] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [12, 633.485] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [23, 1061.85] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [22, 1614.75] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [14, 2150.92] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [28, 2624.31] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [2, 2868.7] ++ - - [128, 512, 1, 64, 160, 160, 96, 96] ++ - [10, 698.236] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [12, 1264.68] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [11, 2119.41] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [2, 3233.85] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [2, 4346.43] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [2, 5289.16] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [28, 5909.29] ++ - - [128, 1024, 1, 64, 160, 160, 96, 96] ++ - [10, 1407.01] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [12, 2551.66] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [17, 4451.37] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - [16, 6727.71] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [22, 8930.58] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [2, 10861.7] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [11, 12324.6] ++ - - [128, 2048, 1, 64, 160, 160, 96, 96] ++ - [15, 2711.69] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [3, 4708.73] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [7, 7969.23] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [9, 12252.8] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [2, 16982.1] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [5, 21355.2] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [26, 24447.7] ++ - - [128, 4096, 1, 64, 160, 160, 96, 96] ++ - [18, 4321.24] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [30, 
7756.46] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [25, 12868.4] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [2, 19392.8] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [2, 26589.6] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [7, 31724.3] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [8, 32440.3] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [12, 146.102] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [9, 273.458] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [7, 470.583] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [26, 740.978] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [28, 971.408] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [28, 1224.61] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [2, 1327.1] ++ - - [256, 128, 1, 64, 288, 288, 96, 96] ++ - [12, 331.512] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [1, 631.102] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [3, 1016.55] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [7, 1560.53] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [22, 2116.6] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [7, 2578.83] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [7, 2809.84] ++ - - [256, 256, 1, 64, 288, 288, 96, 96] ++ - [12, 698.12] ++ - - [256, 256, 1, 128, 288, 288, 160, 160] ++ - [6, 1256.34] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [7, 2103.73] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [11, 3194.75] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [2, 4311.25] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [2, 5203.24] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [2, 5754.24] ++ - - [256, 512, 1, 64, 288, 288, 96, 96] ++ - [23, 1416.03] ++ - - [256, 512, 1, 128, 288, 288, 160, 160] ++ - [12, 2546.63] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [2, 4239.35] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [22, 6454.02] ++ - - [256, 512, 1, 1024, 288, 288, 
1056, 1056] ++ - [7, 8655.84] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [2, 10428.3] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [28, 11588.0] ++ - - [256, 1024, 1, 64, 288, 288, 96, 96] ++ - [23, 2638.34] ++ - - [256, 1024, 1, 128, 288, 288, 160, 160] ++ - [21, 4975.45] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [22, 8344.8] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [22, 12664.4] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [7, 16999.3] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [22, 21029.8] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [28, 23312.3] ++ - - [256, 2048, 1, 64, 288, 288, 96, 96] ++ - [26, 4375.9] ++ - - [256, 2048, 1, 128, 288, 288, 160, 160] ++ - [11, 7866.47] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [2, 13016.9] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [11, 19505.5] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [11, 26431.2] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [2, 31527.8] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [28, 33907.8] ++ - - [256, 4096, 1, 64, 288, 288, 96, 96] ++ - [12, 8261.59] ++ - - [256, 4096, 1, 128, 288, 288, 160, 160] ++ - [11, 13575.2] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [3, 20254.7] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [2, 26952.7] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [11, 32097.0] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [2, 34026.0] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [8, 34426.4] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [12, 224.984] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [12, 419.766] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [7, 749.784] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [14, 1117.59] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [20, 1487.87] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [28, 1844.93] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - 
[11, 1994.4] ++ - - [384, 128, 1, 64, 416, 416, 96, 96] ++ - [23, 507.785] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [21, 972.852] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [11, 1618.38] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [11, 2438.79] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [11, 3266.8] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [28, 3916.71] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [2, 4290.85] ++ - - [384, 256, 1, 64, 416, 416, 96, 96] ++ - [10, 1059.17] ++ - - [384, 256, 1, 128, 416, 416, 160, 160] ++ - [17, 1905.06] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [29, 3160.34] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [11, 4812.28] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [11, 6441.63] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [2, 7810.93] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [2, 8601.67] ++ - - [384, 512, 1, 64, 416, 416, 96, 96] ++ - [29, 2043.34] ++ - - [384, 512, 1, 128, 416, 416, 160, 160] ++ - [29, 3842.7] ++ - - [384, 512, 1, 256, 416, 416, 288, 288] ++ - [2, 6398.63] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [11, 9694.08] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [0, 12924.6] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [2, 15635.8] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [22, 16866.8] ++ - - [384, 1024, 1, 64, 416, 416, 96, 96] ++ - [22, 3565.57] ++ - - [384, 1024, 1, 128, 416, 416, 160, 160] ++ - [28, 6357.41] ++ - - [384, 1024, 1, 256, 416, 416, 288, 288] ++ - [11, 10421.7] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [0, 15282.1] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [2, 20423.7] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [16, 23986.0] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [22, 25670.0] ++ - - [384, 2048, 1, 64, 416, 416, 96, 96] ++ - [27, 5651.43] ++ - - [384, 2048, 1, 128, 416, 416, 160, 160] ++ - [21, 10570.5] ++ - - [384, 
2048, 1, 256, 416, 416, 288, 288] ++ - [23, 17270.9] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [0, 23414.2] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [22, 28715.8] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [2, 30728.1] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [22, 33564.9] ++ - - [384, 4096, 1, 64, 416, 416, 96, 96] ++ - [23, 9321.54] ++ - - [384, 4096, 1, 128, 416, 416, 160, 160] ++ - [23, 14657.9] ++ - - [384, 4096, 1, 256, 416, 416, 288, 288] ++ - [3, 20453.8] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [23, 25958.3] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [22, 28841.3] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [11, 31910.1] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [22, 32506.7] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [3, 488.847] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [14, 902.389] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [7, 1538.82] ++ - - [768, 64, 1, 512, 800, 800, 544, 544] ++ - [9, 2232.6] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [22, 2967.66] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [22, 3708.08] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [9, 3979.81] ++ - - [768, 128, 1, 64, 800, 800, 96, 96] ++ - [29, 992.97] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [10, 1889.89] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [0, 3151.64] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [22, 4777.57] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [2, 6421.49] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [22, 7729.65] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [2, 8118.82] ++ - - [768, 256, 1, 64, 800, 800, 96, 96] ++ - [23, 2130.53] ++ - - [768, 256, 1, 128, 800, 800, 160, 160] ++ - [15, 3825.17] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [2, 6370.29] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [2, 9640.21] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 
1056] ++ - [2, 12865.9] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [11, 15514.7] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [16, 16705.9] ++ - - [768, 512, 1, 64, 800, 800, 96, 96] ++ - [16, 3582.83] ++ - - [768, 512, 1, 128, 800, 800, 160, 160] ++ - [22, 6368.68] ++ - - [768, 512, 1, 256, 800, 800, 288, 288] ++ - [2, 10405.6] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [2, 15381.4] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [2, 20583.4] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [8, 24014.6] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [8, 25511.4] ++ - - [768, 1024, 1, 64, 800, 800, 96, 96] ++ - [10, 5691.05] ++ - - [768, 1024, 1, 128, 800, 800, 160, 160] ++ - [10, 10631.9] ++ - - [768, 1024, 1, 256, 800, 800, 288, 288] ++ - [3, 16513.0] ++ - - [768, 1024, 1, 512, 800, 800, 544, 544] ++ - [20, 22687.2] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [9, 27700.4] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [28, 30531.8] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [22, 33538.3] ++ - - [768, 2048, 1, 64, 800, 800, 96, 96] ++ - [28, 10093.6] ++ - - [768, 2048, 1, 128, 800, 800, 160, 160] ++ - [16, 16134.5] ++ - - [768, 2048, 1, 256, 800, 800, 288, 288] ++ - [23, 22714.1] ++ - - [768, 2048, 1, 512, 800, 800, 544, 544] ++ - [2, 28840.3] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [22, 31995.9] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [22, 35301.5] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [22, 37370.5] ++ - - [768, 4096, 1, 64, 800, 800, 96, 96] ++ - [12, 12722.1] ++ - - [768, 4096, 1, 128, 800, 800, 160, 160] ++ - [2, 19122.0] ++ - - [768, 4096, 1, 256, 800, 800, 288, 288] ++ - [23, 25389.5] ++ - - [768, 4096, 1, 512, 800, 800, 544, 544] ++ - [2, 29809.0] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [28, 34232.7] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [28, 37178.1] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ 
- [28, 36702.3] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [2, 927.67] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [9, 1734.14] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [0, 2950.96] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [9, 4341.56] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [2, 5924.16] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [2, 7453.77] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [22, 8251.09] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 96] ++ - [19, 1893.59] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [10, 3619.94] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [22, 6060.4] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [11, 9258.08] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [22, 12492.3] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [11, 15425.5] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [11, 17053.3] ++ - - [1536, 256, 1, 64, 1568, 1568, 96, 96] ++ - [24, 3418.81] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 160] ++ - [22, 6120.84] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [22, 10016.2] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [13, 14862.4] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [25, 19838.0] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [25, 24074.2] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [4, 25641.8] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 96] ++ - [29, 5762.73] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 160] ++ - [2, 10113.9] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 288] ++ - [23, 17705.3] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [2, 23522.2] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [11, 28515.5] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [22, 30715.2] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [2, 33101.7] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] ++ - [26, 9838.09] ++ - - 
[1536, 1024, 1, 128, 1568, 1568, 160, 160] ++ - [28, 15797.8] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] ++ - [23, 22503.4] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] ++ - [20, 28510.5] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [22, 32034.1] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [22, 35425.7] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [22, 37301.5] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] ++ - [23, 12454.5] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] ++ - [23, 19030.8] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] ++ - [29, 25477.1] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] ++ - [20, 30053.8] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] ++ - [28, 34385.4] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [28, 37109.2] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [11, 37440.0] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] ++ - [12, 14424.3] ++ - - [1536, 4096, 1, 128, 1568, 1568, 160, 160] ++ - [3, 19898.3] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] ++ - [23, 26513.5] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] ++ - [22, 32001.3] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] ++ - [28, 35774.7] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [28, 37333.3] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [14, 37436.3] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [25, 1746.17] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [14, 3297.84] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [13, 4905.14] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [4, 7327.36] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [25, 9869.43] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [4, 11835.8] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [8, 12559.4] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 96] ++ - [11, 3175.1] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [8, 6196.19] ++ 
- - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [25, 10195.8] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [25, 15087.4] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [25, 20395.8] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [25, 24136.3] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [4, 25545.4] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 96] ++ - [13, 5372.15] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 160] ++ - [21, 9947.95] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [29, 16228.1] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [10, 21933.4] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [12, 26465.1] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [12, 28809.3] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [15, 30508.6] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 96] ++ - [14, 9767.45] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 160] ++ - [11, 15716.3] ++ - - [3072, 512, 1, 256, 3104, 3104, 288, 288] ++ - [12, 22378.3] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [17, 28223.7] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [23, 31268.6] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [23, 34506.6] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [11, 35980.9] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] ++ - [28, 12415.3] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] ++ - [22, 18968.9] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] ++ - [17, 25147.7] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] ++ - [17, 29451.5] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [22, 33769.0] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [11, 36427.6] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [2, 36175.5] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] ++ - [21, 14349.7] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] ++ - [17, 19868.9] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] ++ - [21, 
26205.0] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] ++ - [20, 31647.7] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] ++ - [28, 35441.9] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [28, 37186.9] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [2, 36606.5] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] ++ - [23, 15083.2] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] ++ - [23, 21854.5] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] ++ - [23, 27887.5] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] ++ - [28, 32940.6] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] ++ - [28, 35740.6] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] ++ - [16, 36999.3] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [13, 35682.4] ++ - - [4096, 64, 1, 64, 4128, 4128, 96, 96] ++ - [25, 1794.16] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [13, 3624.36] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [8, 6047.48] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [18, 9277.51] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [8, 12762.0] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [13, 15591.3] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [18, 16424.1] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 96] ++ - [22, 5059.46] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [11, 9081.04] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [4, 14506.9] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [28, 21203.4] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [20, 27458.6] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [11, 31860.8] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [25, 32544.0] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 96] ++ - [12, 7860.94] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 160] ++ - [29, 13104.6] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [28, 19780.1] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [23, 
26154.4] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [23, 30932.9] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [22, 33237.6] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [4, 34493.3] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 96] ++ - [11, 10777.1] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 160] ++ - [9, 17045.7] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 288] ++ - [29, 23636.1] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [12, 28937.1] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [12, 32215.5] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [28, 35312.6] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [25, 35733.6] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] ++ - [27, 13237.8] ++ - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] ++ - [29, 19938.0] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] ++ - [23, 24864.9] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] ++ - [20, 30627.6] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [16, 34561.3] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [28, 37052.7] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [22, 35549.9] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] ++ - [23, 15046.8] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] ++ - [23, 20856.2] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] ++ - [23, 27016.4] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] ++ - [22, 32297.4] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] ++ - [28, 35954.1] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [11, 37291.7] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [30, 35673.4] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] ++ - [23, 15017.8] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] ++ - [16, 21305.2] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] ++ - [11, 27487.1] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] ++ - [28, 33021.3] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 
1056] ++ - [28, 35981.0] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] ++ - [11, 37429.5] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [25, 35235.8] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_BBS_BH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_BBS_BH_GB.yaml +new file mode 100644 +index 00000000..93a89ce3 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_BBS_BH_GB.yaml +@@ -0,0 +1,9213 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: 
{} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: 
[32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 
32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: 
-1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: 
Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ 
DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ 
MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} 
++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ 
LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ 
Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ 
WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 
++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ 
PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] 
++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: 
true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ 
NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: 
Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 
1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ 
MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ 
AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ 
LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ 
Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ 
WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 
++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 
0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: 
Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ 
DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ 
MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} 
++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ 
LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ 
Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ 
WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 
++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ 
PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ 
ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: 
true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ 
NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: 
Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 
1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ 
MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ 
AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ 
LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ 
Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ 
WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 
++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: 
Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 
1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ 
MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ 
AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ 
LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ 
Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 
8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ 
LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 
0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: 
Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ 
DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ 
MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} 
++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ 
LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ 
Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ 
WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 
++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ 
PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ 
ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: 
true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ 
NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: 
Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 
1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ 
MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ 
AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ 
LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ 
Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ 
WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 
++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 7 ++ DestDataType: 7 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: 
Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [3, 32.605] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [23, 61.5651] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [11, 107.249] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [12, 161.942] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [23, 232.397] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [2, 296.25] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [11, 333.861] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [12, 60.4855] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [23, 113.765] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [11, 199.482] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [11, 314.321] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [22, 456.548] ++ - - [64, 
128, 1, 2048, 96, 96, 2080, 2080] ++ - [11, 587.335] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [26, 658.123] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [10, 132.765] ++ - - [64, 256, 1, 128, 96, 96, 160, 160] ++ - [23, 247.188] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [3, 429.524] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [20, 685.848] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [9, 969.333] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [2, 1229.55] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [11, 1331.68] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [12, 297.131] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [20, 547.202] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [20, 939.897] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [29, 1478.43] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [20, 2050.63] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [7, 2550.6] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [9, 2824.33] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [9, 609.637] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [20, 1124.93] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [5, 1866.0] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [20, 2949.32] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [5, 4099.75] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [5, 5132.02] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [2, 5855.53] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [10, 1154.98] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [20, 2129.36] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [12, 3665.55] ++ - - [64, 2048, 1, 512, 96, 96, 544, 544] ++ - [5, 5800.25] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [9, 8203.52] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [5, 10521.5] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [20, 12185.0] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [8, 2075.36] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [4, 3737.82] 
++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [8, 6204.58] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [22, 9318.73] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [7, 12865.3] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [2, 15730.2] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [8, 16167.9] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [23, 69.1126] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [23, 131.039] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [10, 224.775] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [12, 340.143] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [22, 462.105] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [7, 590.934] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [7, 657.35] ++ - - [128, 128, 1, 64, 160, 160, 96, 96] ++ - [29, 144.771] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [12, 292.817] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [7, 494.845] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [7, 761.562] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [2, 1041.42] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [22, 1277.92] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [11, 1372.37] ++ - - [128, 256, 1, 64, 160, 160, 96, 96] ++ - [12, 335.221] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [12, 633.485] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [23, 1061.85] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [22, 1614.75] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [14, 2150.92] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [28, 2624.31] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [2, 2868.7] ++ - - [128, 512, 1, 64, 160, 160, 96, 96] ++ - [10, 698.236] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [12, 1264.68] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [11, 2119.41] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [2, 3233.85] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - 
[2, 4346.43] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [2, 5289.16] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [28, 5909.29] ++ - - [128, 1024, 1, 64, 160, 160, 96, 96] ++ - [10, 1407.01] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [12, 2551.66] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [17, 4451.37] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - [16, 6727.71] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [22, 8930.58] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [2, 10861.7] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [11, 12324.6] ++ - - [128, 2048, 1, 64, 160, 160, 96, 96] ++ - [15, 2711.69] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [3, 4708.73] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [7, 7969.23] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [9, 12252.8] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [2, 16982.1] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [5, 21355.2] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [26, 24447.7] ++ - - [128, 4096, 1, 64, 160, 160, 96, 96] ++ - [18, 4321.24] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [30, 7756.46] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [25, 12868.4] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [2, 19392.8] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [2, 26589.6] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [7, 31724.3] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [8, 32440.3] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [12, 146.102] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [9, 273.458] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [7, 470.583] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [26, 740.978] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [28, 971.408] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [28, 1224.61] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [2, 1327.1] ++ - - 
[256, 128, 1, 64, 288, 288, 96, 96] ++ - [12, 331.512] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [1, 631.102] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [3, 1016.55] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [7, 1560.53] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [22, 2116.6] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [7, 2578.83] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [7, 2809.84] ++ - - [256, 256, 1, 64, 288, 288, 96, 96] ++ - [12, 698.12] ++ - - [256, 256, 1, 128, 288, 288, 160, 160] ++ - [6, 1256.34] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [7, 2103.73] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [11, 3194.75] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [2, 4311.25] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [2, 5203.24] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [2, 5754.24] ++ - - [256, 512, 1, 64, 288, 288, 96, 96] ++ - [23, 1416.03] ++ - - [256, 512, 1, 128, 288, 288, 160, 160] ++ - [12, 2546.63] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [2, 4239.35] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [22, 6454.02] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [7, 8655.84] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [2, 10428.3] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [28, 11588.0] ++ - - [256, 1024, 1, 64, 288, 288, 96, 96] ++ - [23, 2638.34] ++ - - [256, 1024, 1, 128, 288, 288, 160, 160] ++ - [21, 4975.45] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [22, 8344.8] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [22, 12664.4] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [7, 16999.3] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [22, 21029.8] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [28, 23312.3] ++ - - [256, 2048, 1, 64, 288, 288, 96, 96] ++ - [26, 4375.9] ++ - - [256, 2048, 1, 128, 288, 288, 160, 160] ++ - [11, 7866.47] ++ - - [256, 2048, 1, 256, 288, 288, 288, 
288] ++ - [2, 13016.9] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [11, 19505.5] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [11, 26431.2] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [2, 31527.8] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [28, 33907.8] ++ - - [256, 4096, 1, 64, 288, 288, 96, 96] ++ - [12, 8261.59] ++ - - [256, 4096, 1, 128, 288, 288, 160, 160] ++ - [11, 13575.2] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [3, 20254.7] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [2, 26952.7] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [11, 32097.0] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [2, 34026.0] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [8, 34426.4] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [12, 224.984] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [12, 419.766] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [7, 749.784] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [14, 1117.59] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [20, 1487.87] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [28, 1844.93] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [11, 1994.4] ++ - - [384, 128, 1, 64, 416, 416, 96, 96] ++ - [23, 507.785] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [21, 972.852] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [11, 1618.38] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [11, 2438.79] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [11, 3266.8] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [28, 3916.71] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [2, 4290.85] ++ - - [384, 256, 1, 64, 416, 416, 96, 96] ++ - [10, 1059.17] ++ - - [384, 256, 1, 128, 416, 416, 160, 160] ++ - [17, 1905.06] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [29, 3160.34] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [11, 4812.28] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [11, 6441.63] ++ - 
- [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [2, 7810.93] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [2, 8601.67] ++ - - [384, 512, 1, 64, 416, 416, 96, 96] ++ - [29, 2043.34] ++ - - [384, 512, 1, 128, 416, 416, 160, 160] ++ - [29, 3842.7] ++ - - [384, 512, 1, 256, 416, 416, 288, 288] ++ - [2, 6398.63] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [11, 9694.08] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [0, 12924.6] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [2, 15635.8] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [22, 16866.8] ++ - - [384, 1024, 1, 64, 416, 416, 96, 96] ++ - [22, 3565.57] ++ - - [384, 1024, 1, 128, 416, 416, 160, 160] ++ - [28, 6357.41] ++ - - [384, 1024, 1, 256, 416, 416, 288, 288] ++ - [11, 10421.7] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [0, 15282.1] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [2, 20423.7] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [16, 23986.0] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [22, 25670.0] ++ - - [384, 2048, 1, 64, 416, 416, 96, 96] ++ - [27, 5651.43] ++ - - [384, 2048, 1, 128, 416, 416, 160, 160] ++ - [21, 10570.5] ++ - - [384, 2048, 1, 256, 416, 416, 288, 288] ++ - [23, 17270.9] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [0, 23414.2] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [22, 28715.8] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [2, 30728.1] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [22, 33564.9] ++ - - [384, 4096, 1, 64, 416, 416, 96, 96] ++ - [23, 9321.54] ++ - - [384, 4096, 1, 128, 416, 416, 160, 160] ++ - [23, 14657.9] ++ - - [384, 4096, 1, 256, 416, 416, 288, 288] ++ - [3, 20453.8] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [23, 25958.3] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [22, 28841.3] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [11, 31910.1] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [22, 32506.7] ++ - - [768, 
64, 1, 64, 800, 800, 96, 96] ++ - [3, 488.847] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [14, 902.389] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [7, 1538.82] ++ - - [768, 64, 1, 512, 800, 800, 544, 544] ++ - [9, 2232.6] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [22, 2967.66] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [22, 3708.08] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [9, 3979.81] ++ - - [768, 128, 1, 64, 800, 800, 96, 96] ++ - [29, 992.97] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [10, 1889.89] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [0, 3151.64] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [22, 4777.57] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [2, 6421.49] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [22, 7729.65] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [2, 8118.82] ++ - - [768, 256, 1, 64, 800, 800, 96, 96] ++ - [23, 2130.53] ++ - - [768, 256, 1, 128, 800, 800, 160, 160] ++ - [15, 3825.17] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [2, 6370.29] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [2, 9640.21] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [2, 12865.9] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [11, 15514.7] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [16, 16705.9] ++ - - [768, 512, 1, 64, 800, 800, 96, 96] ++ - [16, 3582.83] ++ - - [768, 512, 1, 128, 800, 800, 160, 160] ++ - [22, 6368.68] ++ - - [768, 512, 1, 256, 800, 800, 288, 288] ++ - [2, 10405.6] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [2, 15381.4] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [2, 20583.4] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [8, 24014.6] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [8, 25511.4] ++ - - [768, 1024, 1, 64, 800, 800, 96, 96] ++ - [10, 5691.05] ++ - - [768, 1024, 1, 128, 800, 800, 160, 160] ++ - [10, 10631.9] ++ - - [768, 1024, 1, 256, 800, 800, 288, 288] ++ - [3, 
16513.0] ++ - - [768, 1024, 1, 512, 800, 800, 544, 544] ++ - [20, 22687.2] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [9, 27700.4] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [28, 30531.8] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [22, 33538.3] ++ - - [768, 2048, 1, 64, 800, 800, 96, 96] ++ - [28, 10093.6] ++ - - [768, 2048, 1, 128, 800, 800, 160, 160] ++ - [16, 16134.5] ++ - - [768, 2048, 1, 256, 800, 800, 288, 288] ++ - [23, 22714.1] ++ - - [768, 2048, 1, 512, 800, 800, 544, 544] ++ - [2, 28840.3] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [22, 31995.9] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [22, 35301.5] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [22, 37370.5] ++ - - [768, 4096, 1, 64, 800, 800, 96, 96] ++ - [12, 12722.1] ++ - - [768, 4096, 1, 128, 800, 800, 160, 160] ++ - [2, 19122.0] ++ - - [768, 4096, 1, 256, 800, 800, 288, 288] ++ - [23, 25389.5] ++ - - [768, 4096, 1, 512, 800, 800, 544, 544] ++ - [2, 29809.0] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [28, 34232.7] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [28, 37178.1] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [28, 36702.3] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [2, 927.67] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [9, 1734.14] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [0, 2950.96] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [9, 4341.56] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [2, 5924.16] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [2, 7453.77] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [22, 8251.09] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 96] ++ - [19, 1893.59] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [10, 3619.94] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [22, 6060.4] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [11, 9258.08] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 
1056] ++ - [22, 12492.3] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [11, 15425.5] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [11, 17053.3] ++ - - [1536, 256, 1, 64, 1568, 1568, 96, 96] ++ - [24, 3418.81] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 160] ++ - [22, 6120.84] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [22, 10016.2] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [13, 14862.4] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [25, 19838.0] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [25, 24074.2] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [4, 25641.8] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 96] ++ - [29, 5762.73] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 160] ++ - [2, 10113.9] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 288] ++ - [23, 17705.3] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [2, 23522.2] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [11, 28515.5] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [22, 30715.2] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [2, 33101.7] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] ++ - [26, 9838.09] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] ++ - [28, 15797.8] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] ++ - [23, 22503.4] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] ++ - [20, 28510.5] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [22, 32034.1] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [22, 35425.7] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [22, 37301.5] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] ++ - [23, 12454.5] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] ++ - [23, 19030.8] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] ++ - [29, 25477.1] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] ++ - [20, 30053.8] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] ++ - [28, 34385.4] ++ - - [1536, 2048, 1, 2048, 1568, 
1568, 2080, 2080] ++ - [28, 37109.2] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [11, 37440.0] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] ++ - [12, 14424.3] ++ - - [1536, 4096, 1, 128, 1568, 1568, 160, 160] ++ - [3, 19898.3] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] ++ - [23, 26513.5] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] ++ - [22, 32001.3] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] ++ - [28, 35774.7] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [28, 37333.3] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [14, 37436.3] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [25, 1746.17] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [14, 3297.84] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [13, 4905.14] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [4, 7327.36] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [25, 9869.43] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [4, 11835.8] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [8, 12559.4] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 96] ++ - [11, 3175.1] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [8, 6196.19] ++ - - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [25, 10195.8] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [25, 15087.4] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [25, 20395.8] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [25, 24136.3] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [4, 25545.4] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 96] ++ - [13, 5372.15] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 160] ++ - [21, 9947.95] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [29, 16228.1] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [10, 21933.4] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [12, 26465.1] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [12, 28809.3] ++ - - [3072, 256, 1, 4096, 3104, 3104, 
4128, 4128] ++ - [15, 30508.6] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 96] ++ - [14, 9767.45] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 160] ++ - [11, 15716.3] ++ - - [3072, 512, 1, 256, 3104, 3104, 288, 288] ++ - [12, 22378.3] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [17, 28223.7] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [23, 31268.6] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [23, 34506.6] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [11, 35980.9] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] ++ - [28, 12415.3] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] ++ - [22, 18968.9] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] ++ - [17, 25147.7] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] ++ - [17, 29451.5] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [22, 33769.0] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [11, 36427.6] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [2, 36175.5] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] ++ - [21, 14349.7] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] ++ - [17, 19868.9] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] ++ - [21, 26205.0] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] ++ - [20, 31647.7] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] ++ - [28, 35441.9] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [28, 37186.9] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [2, 36606.5] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] ++ - [23, 15083.2] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] ++ - [23, 21854.5] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] ++ - [23, 27887.5] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] ++ - [28, 32940.6] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] ++ - [28, 35740.6] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] ++ - [16, 36999.3] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [13, 35682.4] ++ - - [4096, 64, 1, 
64, 4128, 4128, 96, 96] ++ - [25, 1794.16] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [13, 3624.36] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [8, 6047.48] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [18, 9277.51] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [8, 12762.0] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [13, 15591.3] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [18, 16424.1] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 96] ++ - [22, 5059.46] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [11, 9081.04] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [4, 14506.9] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [28, 21203.4] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [20, 27458.6] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [11, 31860.8] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [25, 32544.0] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 96] ++ - [12, 7860.94] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 160] ++ - [29, 13104.6] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [28, 19780.1] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [23, 26154.4] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [23, 30932.9] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [22, 33237.6] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [4, 34493.3] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 96] ++ - [11, 10777.1] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 160] ++ - [9, 17045.7] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 288] ++ - [29, 23636.1] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [12, 28937.1] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [12, 32215.5] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [28, 35312.6] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [25, 35733.6] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] ++ - [27, 13237.8] ++ - - [4096, 1024, 1, 128, 4128, 
4128, 160, 160] ++ - [29, 19938.0] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] ++ - [23, 24864.9] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] ++ - [20, 30627.6] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [16, 34561.3] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [28, 37052.7] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [22, 35549.9] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] ++ - [23, 15046.8] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] ++ - [23, 20856.2] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] ++ - [23, 27016.4] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] ++ - [22, 32297.4] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] ++ - [28, 35954.1] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [11, 37291.7] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [30, 35673.4] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] ++ - [23, 15017.8] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] ++ - [16, 21305.2] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] ++ - [11, 27487.1] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] ++ - [28, 33021.3] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] ++ - [28, 35981.0] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] ++ - [11, 37429.5] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [25, 35235.8] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HB.yaml +new file mode 100644 +index 00000000..fc8659a8 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HB.yaml +@@ -0,0 +1,10833 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ 
ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 
true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ 
MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: 
false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] 
++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: 
[] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: 
false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 
++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ 
ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ 
VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ 
SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ 
GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ 
NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 
++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ 
AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 
++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ 
DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 
1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false 
++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ 
SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ 
DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ 
AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ 
LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false 
++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ 
WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 
++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: 
Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ 
DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ 
MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ 
AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: 
true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ 
Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ 
_DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ 
LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ 
ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ 
GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 
++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 
++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 
1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 
128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 
++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ 
GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ 
NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ 
StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 
++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [9, 37.8602] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [14, 65.8325] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [25, 122.426] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [25, 179.859] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [14, 254.679] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [2, 319.883] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [7, 353.749] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [26, 70.5447] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [15, 131.615] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [15, 227.31] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [25, 358.457] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [14, 504.974] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [14, 633.581] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [19, 689.315] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [15, 152.898] ++ - - [64, 256, 
1, 128, 96, 96, 160, 160] ++ - [15, 284.36] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [2, 485.509] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [25, 761.355] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [0, 1056.9] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [25, 1311.9] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [33, 1456.67] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [15, 340.835] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [33, 624.992] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [25, 1061.85] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [25, 1640.17] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [0, 2225.98] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [33, 2695.79] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [12, 3035.5] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [33, 669.164] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [8, 1225.33] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [14, 2080.26] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [0, 3221.12] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [25, 4408.1] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [23, 5438.1] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [23, 6197.29] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [21, 1316.07] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [25, 2525.18] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [2, 4270.64] ++ - - [64, 2048, 1, 512, 96, 96, 544, 544] ++ - [0, 6549.14] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [23, 8938.31] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [7, 11133.3] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [12, 12722.7] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [35, 2364.99] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [10, 4212.73] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [21, 6840.18] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [35, 10062.8] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [14, 13495.3] 
++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [33, 16146.5] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [35, 16494.5] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [29, 79.4617] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [25, 150.399] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [25, 256.847] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [0, 375.162] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [2, 524.715] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [14, 620.782] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [8, 685.793] ++ - - [128, 128, 1, 64, 160, 160, 96, 96] ++ - [3, 177.274] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [26, 327.016] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [3, 567.567] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [25, 845.117] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [25, 1118.33] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [25, 1346.16] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [19, 1489.49] ++ - - [128, 256, 1, 64, 160, 160, 96, 96] ++ - [3, 398.547] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [9, 716.608] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [25, 1176.85] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [14, 1754.02] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [25, 2312.99] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [2, 2742.38] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [33, 2995.33] ++ - - [128, 512, 1, 64, 160, 160, 96, 96] ++ - [34, 810.024] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [3, 1445.81] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [0, 2378.73] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [7, 3539.12] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [19, 4630.11] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [7, 5484.1] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [8, 6080.64] ++ - - [128, 1024, 1, 64, 160, 160, 96, 96] 
++ - [26, 1534.7] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [9, 2731.11] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [2, 4536.22] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - [8, 6866.76] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [0, 9172.26] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [31, 11117.7] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [31, 12540.2] ++ - - [128, 2048, 1, 64, 160, 160, 96, 96] ++ - [24, 2902.63] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [18, 5224.11] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [14, 8735.89] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [0, 13252.1] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [0, 18147.3] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [31, 22495.2] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [2, 25333.1] ++ - - [128, 4096, 1, 64, 160, 160, 96, 96] ++ - [35, 4672.02] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [5, 8332.36] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [21, 13577.9] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [2, 20373.1] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [14, 27107.9] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [19, 32195.2] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [21, 33035.2] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [19, 169.098] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [25, 313.008] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [14, 531.395] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [33, 789.221] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [25, 1033.85] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [19, 1289.42] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [25, 1421.59] ++ - - [256, 128, 1, 64, 288, 288, 96, 96] ++ - [26, 390.749] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [26, 712.953] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [25, 1173.89] ++ 
- - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [8, 1742.18] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [19, 2271.95] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [14, 2672.07] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [19, 2958.23] ++ - - [256, 256, 1, 64, 288, 288, 96, 96] ++ - [20, 772.289] ++ - - [256, 256, 1, 128, 288, 288, 160, 160] ++ - [9, 1378.12] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [14, 2276.11] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [33, 3401.71] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [25, 4504.56] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [2, 5361.64] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [14, 5954.12] ++ - - [256, 512, 1, 64, 288, 288, 96, 96] ++ - [26, 1529.37] ++ - - [256, 512, 1, 128, 288, 288, 160, 160] ++ - [24, 2745.87] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [25, 4544.21] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [25, 6805.48] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [14, 9055.92] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [2, 10734.9] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [8, 12059.7] ++ - - [256, 1024, 1, 64, 288, 288, 96, 96] ++ - [15, 2965.22] ++ - - [256, 1024, 1, 128, 288, 288, 160, 160] ++ - [26, 5321.9] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [25, 8832.47] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [14, 13335.1] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [14, 17805.5] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [7, 21915.8] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [33, 24428.2] ++ - - [256, 2048, 1, 64, 288, 288, 96, 96] ++ - [9, 4716.03] ++ - - [256, 2048, 1, 128, 288, 288, 160, 160] ++ - [20, 8423.35] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [2, 13619.3] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [29, 20285.3] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [2, 27395.6] ++ - - [256, 2048, 1, 
2048, 288, 288, 2080, 2080] ++ - [8, 31892.1] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [8, 33930.4] ++ - - [256, 4096, 1, 64, 288, 288, 96, 96] ++ - [22, 8287.09] ++ - - [256, 4096, 1, 128, 288, 288, 160, 160] ++ - [35, 13450.0] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [14, 21338.3] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [14, 27927.2] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [14, 33082.0] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [25, 34610.6] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [35, 34967.4] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [14, 272.689] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [14, 481.221] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [14, 850.772] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [2, 1233.5] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [0, 1625.38] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [8, 1955.69] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [14, 2147.85] ++ - - [384, 128, 1, 64, 416, 416, 96, 96] ++ - [20, 563.852] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [15, 1030.38] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [2, 1694.9] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [25, 2533.3] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [31, 3404.47] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [2, 4058.52] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [2, 4466.19] ++ - - [384, 256, 1, 64, 416, 416, 96, 96] ++ - [9, 1224.02] ++ - - [384, 256, 1, 128, 416, 416, 160, 160] ++ - [26, 2170.23] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [8, 3576.74] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [0, 5238.51] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [2, 6888.61] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [14, 8029.63] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [8, 8893.91] ++ - - [384, 512, 1, 64, 416, 416, 96, 96] ++ - [6, 
2194.44] ++ - - [384, 512, 1, 128, 416, 416, 160, 160] ++ - [3, 3955.03] ++ - - [384, 512, 1, 256, 416, 416, 288, 288] ++ - [0, 6587.07] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [25, 9898.06] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [12, 13166.3] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [14, 15893.8] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [8, 17697.1] ++ - - [384, 1024, 1, 64, 416, 416, 96, 96] ++ - [9, 3731.59] ++ - - [384, 1024, 1, 128, 416, 416, 160, 160] ++ - [8, 6594.82] ++ - - [384, 1024, 1, 256, 416, 416, 288, 288] ++ - [14, 10668.0] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [0, 15673.5] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [21, 20627.8] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [16, 24685.3] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [21, 25930.4] ++ - - [384, 2048, 1, 64, 416, 416, 96, 96] ++ - [15, 6765.92] ++ - - [384, 2048, 1, 128, 416, 416, 160, 160] ++ - [20, 12091.7] ++ - - [384, 2048, 1, 256, 416, 416, 288, 288] ++ - [24, 18677.7] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [2, 24898.2] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [25, 29818.4] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [25, 31668.8] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [33, 33968.5] ++ - - [384, 4096, 1, 64, 416, 416, 96, 96] ++ - [36, 10665.7] ++ - - [384, 4096, 1, 128, 416, 416, 160, 160] ++ - [26, 16547.0] ++ - - [384, 4096, 1, 256, 416, 416, 288, 288] ++ - [24, 22545.0] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [23, 28061.4] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [14, 30385.5] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [2, 32994.6] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [25, 32801.1] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [14, 536.814] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [14, 984.428] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [14, 1675.93] ++ - - 
[768, 64, 1, 512, 800, 800, 544, 544] ++ - [17, 2488.46] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [33, 3245.11] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [33, 3920.83] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [14, 4339.41] ++ - - [768, 128, 1, 64, 800, 800, 96, 96] ++ - [1, 1183.28] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [26, 2059.4] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [25, 3388.88] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [2, 5063.55] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [14, 6704.19] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [2, 7986.31] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [2, 8856.15] ++ - - [768, 256, 1, 64, 800, 800, 96, 96] ++ - [11, 2193.29] ++ - - [768, 256, 1, 128, 800, 800, 160, 160] ++ - [3, 3941.4] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [0, 6520.51] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [14, 9854.46] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [14, 13130.3] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [14, 15814.5] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [2, 17650.2] ++ - - [768, 512, 1, 64, 800, 800, 96, 96] ++ - [24, 3741.57] ++ - - [768, 512, 1, 128, 800, 800, 160, 160] ++ - [12, 6590.5] ++ - - [768, 512, 1, 256, 800, 800, 288, 288] ++ - [25, 10966.7] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [8, 16063.7] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [10, 20924.7] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [10, 24520.6] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [24, 26012.5] ++ - - [768, 1024, 1, 64, 800, 800, 96, 96] ++ - [18, 6115.63] ++ - - [768, 1024, 1, 128, 800, 800, 160, 160] ++ - [18, 11615.9] ++ - - [768, 1024, 1, 256, 800, 800, 288, 288] ++ - [13, 17879.8] ++ - - [768, 1024, 1, 512, 800, 800, 544, 544] ++ - [25, 24815.3] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [25, 28872.4] ++ - - [768, 1024, 1, 2048, 800, 800, 
2080, 2080] ++ - [33, 31021.1] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [33, 33823.3] ++ - - [768, 2048, 1, 64, 800, 800, 96, 96] ++ - [24, 11593.2] ++ - - [768, 2048, 1, 128, 800, 800, 160, 160] ++ - [34, 17838.7] ++ - - [768, 2048, 1, 256, 800, 800, 288, 288] ++ - [26, 24790.9] ++ - - [768, 2048, 1, 512, 800, 800, 544, 544] ++ - [2, 30657.3] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [25, 33062.7] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [25, 35924.4] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [33, 37545.9] ++ - - [768, 4096, 1, 64, 800, 800, 96, 96] ++ - [15, 15421.4] ++ - - [768, 4096, 1, 128, 800, 800, 160, 160] ++ - [13, 22172.6] ++ - - [768, 4096, 1, 256, 800, 800, 288, 288] ++ - [26, 28660.7] ++ - - [768, 4096, 1, 512, 800, 800, 544, 544] ++ - [2, 32062.2] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [33, 35550.9] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [33, 37830.3] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [33, 37068.2] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [8, 1066.17] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [19, 1967.31] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [19, 3327.94] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [2, 5014.61] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [25, 6433.81] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [2, 7917.83] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [17, 8640.82] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 96] ++ - [20, 2230.22] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [34, 4043.35] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [14, 6688.59] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [12, 10009.3] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [14, 13293.3] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [14, 16121.6] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [2, 17756.8] ++ - - 
[1536, 256, 1, 64, 1568, 1568, 96, 96] ++ - [32, 3849.77] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 160] ++ - [25, 6796.06] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [29, 10933.3] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [16, 16056.1] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [29, 20863.9] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [5, 24714.8] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [5, 25907.0] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 96] ++ - [18, 5968.43] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 160] ++ - [34, 11440.3] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 288] ++ - [24, 17580.1] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [18, 24077.8] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [25, 28836.2] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [25, 30972.8] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [2, 33604.2] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] ++ - [26, 11567.9] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] ++ - [26, 17898.9] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] ++ - [26, 24498.3] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] ++ - [25, 30336.3] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [25, 32934.2] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [25, 35982.9] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [25, 37712.9] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] ++ - [24, 15650.4] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] ++ - [1, 22274.3] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] ++ - [34, 28297.1] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] ++ - [23, 31688.2] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] ++ - [33, 35575.6] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [25, 37895.5] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [19, 37782.0] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] ++ - [32, 18261.7] ++ 
- - [1536, 4096, 1, 128, 1568, 1568, 160, 160] ++ - [34, 25219.4] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] ++ - [34, 29638.0] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] ++ - [25, 34177.5] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] ++ - [33, 37186.6] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [33, 38147.5] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [19, 38089.6] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [27, 1989.71] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [2, 3580.29] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [0, 5465.5] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [16, 7550.5] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [29, 10205.1] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [21, 12080.8] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [29, 12805.6] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 96] ++ - [28, 3579.27] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [5, 6428.87] ++ - - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [16, 10488.0] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [35, 15480.7] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [29, 20956.3] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [16, 24824.5] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [35, 26187.6] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 96] ++ - [13, 6271.09] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 160] ++ - [26, 11432.5] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [18, 17802.4] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [25, 22446.9] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [24, 26876.7] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [24, 28742.0] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [13, 30332.9] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 96] ++ - [13, 11508.4] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 160] ++ - [26, 17822.9] ++ - - 
[3072, 512, 1, 256, 3104, 3104, 288, 288] ++ - [34, 24313.4] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [29, 30206.6] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [16, 32411.9] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [14, 35082.0] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [1, 36208.8] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] ++ - [26, 15303.1] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] ++ - [26, 22145.7] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] ++ - [34, 28056.6] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] ++ - [25, 31658.9] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [33, 35095.0] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [33, 37139.4] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [2, 36528.9] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] ++ - [26, 18370.1] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] ++ - [34, 25143.8] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] ++ - [26, 29435.9] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] ++ - [25, 33958.4] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] ++ - [33, 36889.1] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [33, 37955.9] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [0, 36818.5] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] ++ - [24, 19499.9] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] ++ - [26, 26247.8] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] ++ - [34, 31503.5] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] ++ - [33, 35406.7] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] ++ - [33, 37185.2] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] ++ - [33, 38019.2] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [2, 37061.7] ++ - - [4096, 64, 1, 64, 4128, 4128, 96, 96] ++ - [4, 2542.4] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [29, 4318.47] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [29, 
7062.62] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [21, 10302.3] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [14, 13399.7] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [21, 15871.1] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [21, 16690.9] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 96] ++ - [9, 5849.8] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [25, 10163.4] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [16, 16085.5] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [25, 22880.6] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [16, 28819.1] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [5, 32859.3] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [10, 33157.1] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 96] ++ - [26, 9037.04] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 160] ++ - [26, 14737.9] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [14, 21304.5] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [25, 27288.4] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [25, 31908.2] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [16, 33568.6] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [21, 34921.6] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 96] ++ - [18, 13443.3] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 160] ++ - [34, 20121.1] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 288] ++ - [20, 25915.8] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [33, 30881.3] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [33, 33258.8] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [25, 35831.5] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [16, 36485.9] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] ++ - [34, 16655.5] ++ - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] ++ - [26, 23495.5] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] ++ - [32, 29021.7] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] ++ - [25, 
32596.0] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [33, 35983.6] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [33, 37809.4] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [29, 36173.4] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] ++ - [32, 19307.1] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] ++ - [14, 24275.3] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] ++ - [26, 30362.6] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] ++ - [33, 34695.6] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] ++ - [33, 37399.8] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [19, 38019.9] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [29, 36454.0] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] ++ - [30, 18500.7] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] ++ - [13, 24188.6] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] ++ - [14, 30466.2] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] ++ - [25, 35442.2] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] ++ - [25, 37396.6] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] ++ - [14, 38168.5] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [29, 36101.8] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HB_GB.yaml +new file mode 100644 +index 00000000..17a004bc +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HB_GB.yaml +@@ -0,0 +1,10833 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ 
Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ 
CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ 
MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ 
SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: 
[32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 
32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: 
-1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: 
Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ 
DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ 
MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ 
AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: 
true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ 
Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ 
_DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ 
LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ 
ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ 
GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 
++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 
1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ 
LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 
2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ 
_VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 
0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ 
TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 
++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 
++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ 
StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 
++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 
1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 
1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 
++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 
1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 
++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 
1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 
1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ 
allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 
1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 
2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 
++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 4 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [9, 37.8602] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [14, 65.8325] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [25, 122.426] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [25, 179.859] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [14, 254.679] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [2, 319.883] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [7, 353.749] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [26, 70.5447] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [15, 131.615] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [15, 227.31] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [25, 358.457] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [14, 504.974] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [14, 633.581] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [19, 689.315] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [15, 152.898] ++ - - [64, 256, 
1, 128, 96, 96, 160, 160] ++ - [15, 284.36] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [2, 485.509] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [25, 761.355] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [0, 1056.9] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [25, 1311.9] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [33, 1456.67] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [15, 340.835] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [33, 624.992] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [25, 1061.85] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [25, 1640.17] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [0, 2225.98] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [33, 2695.79] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [12, 3035.5] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [33, 669.164] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [8, 1225.33] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [14, 2080.26] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [0, 3221.12] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [25, 4408.1] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [23, 5438.1] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [23, 6197.29] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [21, 1316.07] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [25, 2525.18] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [2, 4270.64] ++ - - [64, 2048, 1, 512, 96, 96, 544, 544] ++ - [0, 6549.14] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [23, 8938.31] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [7, 11133.3] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [12, 12722.7] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [35, 2364.99] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [10, 4212.73] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [21, 6840.18] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [35, 10062.8] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [14, 13495.3] 
++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [33, 16146.5] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [35, 16494.5] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [29, 79.4617] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [25, 150.399] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [25, 256.847] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [0, 375.162] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [2, 524.715] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [14, 620.782] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [8, 685.793] ++ - - [128, 128, 1, 64, 160, 160, 96, 96] ++ - [3, 177.274] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [26, 327.016] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [3, 567.567] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [25, 845.117] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [25, 1118.33] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [25, 1346.16] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [19, 1489.49] ++ - - [128, 256, 1, 64, 160, 160, 96, 96] ++ - [3, 398.547] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [9, 716.608] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [25, 1176.85] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [14, 1754.02] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [25, 2312.99] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [2, 2742.38] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [33, 2995.33] ++ - - [128, 512, 1, 64, 160, 160, 96, 96] ++ - [34, 810.024] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [3, 1445.81] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [0, 2378.73] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [7, 3539.12] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [19, 4630.11] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [7, 5484.1] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [8, 6080.64] ++ - - [128, 1024, 1, 64, 160, 160, 96, 96] 
++ - [26, 1534.7] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [9, 2731.11] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [2, 4536.22] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - [8, 6866.76] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [0, 9172.26] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [31, 11117.7] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [31, 12540.2] ++ - - [128, 2048, 1, 64, 160, 160, 96, 96] ++ - [24, 2902.63] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [18, 5224.11] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [14, 8735.89] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [0, 13252.1] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [0, 18147.3] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [31, 22495.2] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [2, 25333.1] ++ - - [128, 4096, 1, 64, 160, 160, 96, 96] ++ - [35, 4672.02] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [5, 8332.36] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [21, 13577.9] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [2, 20373.1] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [14, 27107.9] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [19, 32195.2] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [21, 33035.2] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [19, 169.098] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [25, 313.008] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [14, 531.395] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [33, 789.221] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [25, 1033.85] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [19, 1289.42] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [25, 1421.59] ++ - - [256, 128, 1, 64, 288, 288, 96, 96] ++ - [26, 390.749] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [26, 712.953] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [25, 1173.89] ++ 
- - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [8, 1742.18] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [19, 2271.95] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [14, 2672.07] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [19, 2958.23] ++ - - [256, 256, 1, 64, 288, 288, 96, 96] ++ - [20, 772.289] ++ - - [256, 256, 1, 128, 288, 288, 160, 160] ++ - [9, 1378.12] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [14, 2276.11] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [33, 3401.71] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [25, 4504.56] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [2, 5361.64] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [14, 5954.12] ++ - - [256, 512, 1, 64, 288, 288, 96, 96] ++ - [26, 1529.37] ++ - - [256, 512, 1, 128, 288, 288, 160, 160] ++ - [24, 2745.87] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [25, 4544.21] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [25, 6805.48] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [14, 9055.92] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [2, 10734.9] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [8, 12059.7] ++ - - [256, 1024, 1, 64, 288, 288, 96, 96] ++ - [15, 2965.22] ++ - - [256, 1024, 1, 128, 288, 288, 160, 160] ++ - [26, 5321.9] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [25, 8832.47] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [14, 13335.1] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [14, 17805.5] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [7, 21915.8] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [33, 24428.2] ++ - - [256, 2048, 1, 64, 288, 288, 96, 96] ++ - [9, 4716.03] ++ - - [256, 2048, 1, 128, 288, 288, 160, 160] ++ - [20, 8423.35] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [2, 13619.3] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [29, 20285.3] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [2, 27395.6] ++ - - [256, 2048, 1, 
2048, 288, 288, 2080, 2080] ++ - [8, 31892.1] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [8, 33930.4] ++ - - [256, 4096, 1, 64, 288, 288, 96, 96] ++ - [22, 8287.09] ++ - - [256, 4096, 1, 128, 288, 288, 160, 160] ++ - [35, 13450.0] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [14, 21338.3] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [14, 27927.2] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [14, 33082.0] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [25, 34610.6] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [35, 34967.4] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [14, 272.689] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [14, 481.221] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [14, 850.772] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [2, 1233.5] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [0, 1625.38] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [8, 1955.69] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [14, 2147.85] ++ - - [384, 128, 1, 64, 416, 416, 96, 96] ++ - [20, 563.852] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [15, 1030.38] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [2, 1694.9] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [25, 2533.3] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [31, 3404.47] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [2, 4058.52] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [2, 4466.19] ++ - - [384, 256, 1, 64, 416, 416, 96, 96] ++ - [9, 1224.02] ++ - - [384, 256, 1, 128, 416, 416, 160, 160] ++ - [26, 2170.23] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [8, 3576.74] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [0, 5238.51] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [2, 6888.61] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [14, 8029.63] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [8, 8893.91] ++ - - [384, 512, 1, 64, 416, 416, 96, 96] ++ - [6, 
2194.44] ++ - - [384, 512, 1, 128, 416, 416, 160, 160] ++ - [3, 3955.03] ++ - - [384, 512, 1, 256, 416, 416, 288, 288] ++ - [0, 6587.07] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [25, 9898.06] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [12, 13166.3] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [14, 15893.8] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [8, 17697.1] ++ - - [384, 1024, 1, 64, 416, 416, 96, 96] ++ - [9, 3731.59] ++ - - [384, 1024, 1, 128, 416, 416, 160, 160] ++ - [8, 6594.82] ++ - - [384, 1024, 1, 256, 416, 416, 288, 288] ++ - [14, 10668.0] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [0, 15673.5] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [21, 20627.8] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [16, 24685.3] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [21, 25930.4] ++ - - [384, 2048, 1, 64, 416, 416, 96, 96] ++ - [15, 6765.92] ++ - - [384, 2048, 1, 128, 416, 416, 160, 160] ++ - [20, 12091.7] ++ - - [384, 2048, 1, 256, 416, 416, 288, 288] ++ - [24, 18677.7] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [2, 24898.2] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [25, 29818.4] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [25, 31668.8] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [33, 33968.5] ++ - - [384, 4096, 1, 64, 416, 416, 96, 96] ++ - [36, 10665.7] ++ - - [384, 4096, 1, 128, 416, 416, 160, 160] ++ - [26, 16547.0] ++ - - [384, 4096, 1, 256, 416, 416, 288, 288] ++ - [24, 22545.0] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [23, 28061.4] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [14, 30385.5] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [2, 32994.6] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [25, 32801.1] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [14, 536.814] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [14, 984.428] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [14, 1675.93] ++ - - 
[768, 64, 1, 512, 800, 800, 544, 544] ++ - [17, 2488.46] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [33, 3245.11] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [33, 3920.83] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [14, 4339.41] ++ - - [768, 128, 1, 64, 800, 800, 96, 96] ++ - [1, 1183.28] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [26, 2059.4] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [25, 3388.88] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [2, 5063.55] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [14, 6704.19] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [2, 7986.31] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [2, 8856.15] ++ - - [768, 256, 1, 64, 800, 800, 96, 96] ++ - [11, 2193.29] ++ - - [768, 256, 1, 128, 800, 800, 160, 160] ++ - [3, 3941.4] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [0, 6520.51] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [14, 9854.46] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [14, 13130.3] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [14, 15814.5] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [2, 17650.2] ++ - - [768, 512, 1, 64, 800, 800, 96, 96] ++ - [24, 3741.57] ++ - - [768, 512, 1, 128, 800, 800, 160, 160] ++ - [12, 6590.5] ++ - - [768, 512, 1, 256, 800, 800, 288, 288] ++ - [25, 10966.7] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [8, 16063.7] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [10, 20924.7] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [10, 24520.6] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [24, 26012.5] ++ - - [768, 1024, 1, 64, 800, 800, 96, 96] ++ - [18, 6115.63] ++ - - [768, 1024, 1, 128, 800, 800, 160, 160] ++ - [18, 11615.9] ++ - - [768, 1024, 1, 256, 800, 800, 288, 288] ++ - [13, 17879.8] ++ - - [768, 1024, 1, 512, 800, 800, 544, 544] ++ - [25, 24815.3] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [25, 28872.4] ++ - - [768, 1024, 1, 2048, 800, 800, 
2080, 2080] ++ - [33, 31021.1] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [33, 33823.3] ++ - - [768, 2048, 1, 64, 800, 800, 96, 96] ++ - [24, 11593.2] ++ - - [768, 2048, 1, 128, 800, 800, 160, 160] ++ - [34, 17838.7] ++ - - [768, 2048, 1, 256, 800, 800, 288, 288] ++ - [26, 24790.9] ++ - - [768, 2048, 1, 512, 800, 800, 544, 544] ++ - [2, 30657.3] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [25, 33062.7] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [25, 35924.4] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [33, 37545.9] ++ - - [768, 4096, 1, 64, 800, 800, 96, 96] ++ - [15, 15421.4] ++ - - [768, 4096, 1, 128, 800, 800, 160, 160] ++ - [13, 22172.6] ++ - - [768, 4096, 1, 256, 800, 800, 288, 288] ++ - [26, 28660.7] ++ - - [768, 4096, 1, 512, 800, 800, 544, 544] ++ - [2, 32062.2] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [33, 35550.9] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [33, 37830.3] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [33, 37068.2] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [8, 1066.17] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [19, 1967.31] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [19, 3327.94] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [2, 5014.61] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [25, 6433.81] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [2, 7917.83] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [17, 8640.82] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 96] ++ - [20, 2230.22] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [34, 4043.35] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [14, 6688.59] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [12, 10009.3] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [14, 13293.3] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [14, 16121.6] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [2, 17756.8] ++ - - 
[1536, 256, 1, 64, 1568, 1568, 96, 96] ++ - [32, 3849.77] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 160] ++ - [25, 6796.06] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [29, 10933.3] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [16, 16056.1] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [29, 20863.9] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [5, 24714.8] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [5, 25907.0] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 96] ++ - [18, 5968.43] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 160] ++ - [34, 11440.3] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 288] ++ - [24, 17580.1] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [18, 24077.8] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [25, 28836.2] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [25, 30972.8] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [2, 33604.2] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] ++ - [26, 11567.9] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] ++ - [26, 17898.9] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] ++ - [26, 24498.3] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] ++ - [25, 30336.3] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [25, 32934.2] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [25, 35982.9] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [25, 37712.9] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] ++ - [24, 15650.4] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] ++ - [1, 22274.3] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] ++ - [34, 28297.1] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] ++ - [23, 31688.2] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] ++ - [33, 35575.6] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [25, 37895.5] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [19, 37782.0] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] ++ - [32, 18261.7] ++ 
- - [1536, 4096, 1, 128, 1568, 1568, 160, 160] ++ - [34, 25219.4] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] ++ - [34, 29638.0] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] ++ - [25, 34177.5] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] ++ - [33, 37186.6] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [33, 38147.5] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [19, 38089.6] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [27, 1989.71] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [2, 3580.29] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [0, 5465.5] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [16, 7550.5] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [29, 10205.1] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [21, 12080.8] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [29, 12805.6] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 96] ++ - [28, 3579.27] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [5, 6428.87] ++ - - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [16, 10488.0] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [35, 15480.7] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [29, 20956.3] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [16, 24824.5] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [35, 26187.6] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 96] ++ - [13, 6271.09] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 160] ++ - [26, 11432.5] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [18, 17802.4] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [25, 22446.9] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [24, 26876.7] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [24, 28742.0] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [13, 30332.9] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 96] ++ - [13, 11508.4] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 160] ++ - [26, 17822.9] ++ - - 
[3072, 512, 1, 256, 3104, 3104, 288, 288] ++ - [34, 24313.4] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [29, 30206.6] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [16, 32411.9] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [14, 35082.0] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [1, 36208.8] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] ++ - [26, 15303.1] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] ++ - [26, 22145.7] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] ++ - [34, 28056.6] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] ++ - [25, 31658.9] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [33, 35095.0] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [33, 37139.4] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [2, 36528.9] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] ++ - [26, 18370.1] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] ++ - [34, 25143.8] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] ++ - [26, 29435.9] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] ++ - [25, 33958.4] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] ++ - [33, 36889.1] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [33, 37955.9] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [0, 36818.5] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] ++ - [24, 19499.9] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] ++ - [26, 26247.8] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] ++ - [34, 31503.5] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] ++ - [33, 35406.7] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] ++ - [33, 37185.2] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] ++ - [33, 38019.2] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [2, 37061.7] ++ - - [4096, 64, 1, 64, 4128, 4128, 96, 96] ++ - [4, 2542.4] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [29, 4318.47] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [29, 
7062.62] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [21, 10302.3] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [14, 13399.7] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [21, 15871.1] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [21, 16690.9] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 96] ++ - [9, 5849.8] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [25, 10163.4] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [16, 16085.5] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [25, 22880.6] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [16, 28819.1] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [5, 32859.3] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [10, 33157.1] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 96] ++ - [26, 9037.04] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 160] ++ - [26, 14737.9] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [14, 21304.5] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [25, 27288.4] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [25, 31908.2] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [16, 33568.6] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [21, 34921.6] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 96] ++ - [18, 13443.3] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 160] ++ - [34, 20121.1] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 288] ++ - [20, 25915.8] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [33, 30881.3] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [33, 33258.8] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [25, 35831.5] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [16, 36485.9] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] ++ - [34, 16655.5] ++ - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] ++ - [26, 23495.5] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] ++ - [32, 29021.7] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] ++ - [25, 
32596.0] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [33, 35983.6] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [33, 37809.4] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [29, 36173.4] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] ++ - [32, 19307.1] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] ++ - [14, 24275.3] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] ++ - [26, 30362.6] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] ++ - [33, 34695.6] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] ++ - [33, 37399.8] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [19, 38019.9] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [29, 36454.0] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] ++ - [30, 18500.7] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] ++ - [13, 24188.6] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] ++ - [14, 30466.2] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] ++ - [25, 35442.2] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] ++ - [25, 37396.6] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] ++ - [14, 38168.5] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [29, 36101.8] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HHS_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HHS_BH.yaml +new file mode 100644 +index 00000000..86ccb1b4 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HHS_BH.yaml +@@ -0,0 +1,15423 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ 
Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: 
default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ 
MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ 
OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ 
AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ 
LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ 
Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: 
Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false 
++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ 
SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ 
DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} 
++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ 
LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ 
Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ 
WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ 
LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: 
Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 
1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ 
MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: 
{} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ 
LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ 
Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 
++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 
++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ 
PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 
64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: 
true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ 
NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: 
Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ 
DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 
++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ 
SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 
32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 
++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 
++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 
0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: 
Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 
1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ 
MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ 
AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ 
LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ 
Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ 
WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ 
LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: 
Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 
1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ 
MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ 
AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ 
LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ 
Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ 
WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ 
LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: 
Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ 
DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 
++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ 
SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 
32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 
++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 
++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ 
SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 
32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 
++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: 
{} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ 
LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ 
Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ 
WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ 
LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: 
Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 
1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ 
MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ 
AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ 
LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ 
Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ 
WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ 
LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: 
Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 
1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ 
MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ 
SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ 
AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ 
LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ 
Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ 
WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ 
LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ 
PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ 
ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ 
GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ 
NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: 
Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ 
DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ 
MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ 
WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 
++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ 
PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ 
ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ 
GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ 
NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ 
SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 
32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 
++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [8, 36.7457] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [20, 68.6421] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [29, 111.931] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [39, 175.083] 
++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [39, 247.116] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [2, 310.505] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [2, 340.267] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [40, 65.5114] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [21, 121.913] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [3, 220.312] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [39, 349.234] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [6, 491.856] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [24, 618.149] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [33, 679.307] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [21, 151.99] ++ - - [64, 256, 1, 128, 96, 96, 160, 160] ++ - [6, 279.582] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [6, 479.567] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [2, 745.52] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [39, 1027.01] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [39, 1260.26] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [43, 1410.03] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [12, 305.795] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [20, 562.013] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [39, 962.548] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [2, 1496.89] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [2, 2078.96] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [24, 2569.05] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [24, 2891.68] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [20, 622.3] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [48, 1147.71] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [20, 1956.06] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [31, 3043.76] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [51, 4295.79] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [41, 5258.89] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [11, 5969.6] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [20, 
1227.66] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [39, 2253.48] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [37, 3857.27] ++ - - [64, 2048, 1, 512, 96, 96, 544, 544] ++ - [19, 6011.17] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [4, 8396.99] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [51, 10595.4] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [39, 12061.8] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [26, 2226.57] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [27, 3942.02] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [9, 6505.95] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [4, 9536.56] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [33, 12933.5] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [33, 15681.0] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [46, 16253.5] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [7, 73.833] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [20, 135.633] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [0, 233.509] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [20, 344.955] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [24, 486.267] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [6, 611.525] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [15, 666.807] ++ - - [128, 128, 1, 64, 160, 160, 96, 96] ++ - [21, 178.603] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [7, 331.723] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [2, 547.557] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [6, 824.35] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [43, 1096.84] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [51, 1317.2] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [43, 1409.91] ++ - - [128, 256, 1, 64, 160, 160, 96, 96] ++ - [47, 368.437] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [3, 662.189] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [39, 1096.26] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [43, 
1651.95] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [24, 2200.28] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [6, 2639.9] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [6, 2887.08] ++ - - [128, 512, 1, 64, 160, 160, 96, 96] ++ - [7, 798.611] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [23, 1415.08] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [43, 2341.22] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [20, 3489.44] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [43, 4564.91] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [6, 5405.67] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [33, 5917.36] ++ - - [128, 1024, 1, 64, 160, 160, 96, 96] ++ - [52, 1583.05] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [7, 2834.47] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [29, 4702.13] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - [29, 6981.04] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [51, 9150.99] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [20, 10891.2] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [43, 12325.1] ++ - - [128, 2048, 1, 64, 160, 160, 96, 96] ++ - [44, 2882.68] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [40, 5170.97] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [39, 8593.78] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [51, 12941.6] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [20, 17751.3] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [2, 21555.8] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [15, 24389.9] ++ - - [128, 4096, 1, 64, 160, 160, 96, 96] ++ - [9, 4599.63] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [45, 8503.38] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [18, 13859.7] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [29, 20533.5] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [15, 27450.2] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [6, 32243.5] 
++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [46, 32669.2] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [21, 159.99] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [2, 296.543] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [20, 485.675] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [6, 717.832] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [43, 1018.84] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [24, 1259.41] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [6, 1335.98] ++ - - [256, 128, 1, 64, 288, 288, 96, 96] ++ - [52, 376.508] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [7, 693.847] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [20, 1141.15] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [6, 1696.21] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [6, 2236.21] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [6, 2657.77] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [43, 2900.5] ++ - - [256, 256, 1, 64, 288, 288, 96, 96] ++ - [16, 795.427] ++ - - [256, 256, 1, 128, 288, 288, 160, 160] ++ - [5, 1415.8] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [43, 2329.51] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [2, 3449.25] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [33, 4417.96] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [6, 5296.04] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [6, 5854.77] ++ - - [256, 512, 1, 64, 288, 288, 96, 96] ++ - [21, 1523.81] ++ - - [256, 512, 1, 128, 288, 288, 160, 160] ++ - [25, 2709.06] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [39, 4698.83] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [24, 6919.13] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [51, 9062.03] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [24, 10725.4] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [6, 11877.7] ++ - - [256, 1024, 1, 64, 288, 288, 96, 96] ++ - [47, 2951.14] ++ - - [256, 1024, 1, 128, 288, 288, 160, 
160] ++ - [25, 5266.74] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [20, 8731.31] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [43, 13153.4] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [24, 17491.0] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [15, 21173.3] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [15, 23730.1] ++ - - [256, 2048, 1, 64, 288, 288, 96, 96] ++ - [40, 4922.17] ++ - - [256, 2048, 1, 128, 288, 288, 160, 160] ++ - [42, 8645.79] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [24, 13915.8] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [39, 20629.8] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [0, 27165.4] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [6, 31676.6] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [24, 33904.5] ++ - - [256, 4096, 1, 64, 288, 288, 96, 96] ++ - [17, 8928.77] ++ - - [256, 4096, 1, 128, 288, 288, 160, 160] ++ - [25, 14966.3] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [7, 21681.2] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [6, 28146.7] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [6, 32918.6] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [43, 34272.5] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [53, 34792.5] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [11, 255.667] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [2, 470.847] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [20, 765.847] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [48, 1123.98] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [48, 1548.09] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [15, 1901.1] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [33, 2091.7] ++ - - [384, 128, 1, 64, 416, 416, 96, 96] ++ - [49, 547.464] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [40, 1046.31] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [3, 1728.42] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [24, 
2559.32] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [51, 3348.52] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [24, 4000.6] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [24, 4340.61] ++ - - [384, 256, 1, 64, 416, 416, 96, 96] ++ - [21, 1145.78] ++ - - [384, 256, 1, 128, 416, 416, 160, 160] ++ - [44, 2036.72] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [2, 3352.31] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [43, 5011.6] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [6, 6638.73] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [43, 7912.21] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [43, 8765.9] ++ - - [384, 512, 1, 64, 416, 416, 96, 96] ++ - [21, 2307.94] ++ - - [384, 512, 1, 128, 416, 416, 160, 160] ++ - [7, 4084.68] ++ - - [384, 512, 1, 256, 416, 416, 288, 288] ++ - [0, 6471.84] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [6, 9722.14] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [24, 12959.5] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [24, 15626.1] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [43, 17407.9] ++ - - [384, 1024, 1, 64, 416, 416, 96, 96] ++ - [40, 3699.75] ++ - - [384, 1024, 1, 128, 416, 416, 160, 160] ++ - [53, 6651.44] ++ - - [384, 1024, 1, 256, 416, 416, 288, 288] ++ - [4, 10926.2] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [6, 15931.5] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [6, 20872.6] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [9, 24281.0] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [43, 25732.7] ++ - - [384, 2048, 1, 64, 416, 416, 96, 96] ++ - [49, 6064.77] ++ - - [384, 2048, 1, 128, 416, 416, 160, 160] ++ - [33, 11356.4] ++ - - [384, 2048, 1, 256, 416, 416, 288, 288] ++ - [37, 18214.6] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [51, 24514.6] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [43, 29736.9] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [6, 31446.8] ++ - - 
[384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [51, 33687.0] ++ - - [384, 4096, 1, 64, 416, 416, 96, 96] ++ - [52, 10806.6] ++ - - [384, 4096, 1, 128, 416, 416, 160, 160] ++ - [44, 16615.2] ++ - - [384, 4096, 1, 256, 416, 416, 288, 288] ++ - [44, 22539.9] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [44, 27574.2] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [25, 29728.7] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [50, 32596.5] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [51, 32614.7] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [30, 499.877] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [12, 923.313] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [13, 1497.78] ++ - - [768, 64, 1, 512, 800, 800, 544, 544] ++ - [24, 2229.43] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [6, 3067.31] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [43, 3766.63] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [19, 4000.92] ++ - - [768, 128, 1, 64, 800, 800, 96, 96] ++ - [38, 1097.98] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [38, 2014.55] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [6, 3333.22] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [33, 5123.86] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [43, 6740.98] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [6, 7982.49] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [6, 8686.1] ++ - - [768, 256, 1, 64, 800, 800, 96, 96] ++ - [49, 2159.42] ++ - - [768, 256, 1, 128, 800, 800, 160, 160] ++ - [7, 3877.02] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [43, 6413.31] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [2, 9668.01] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [43, 12891.5] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [22, 15527.9] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [24, 17518.4] ++ - - [768, 512, 1, 64, 800, 800, 96, 96] ++ - [21, 3868.69] ++ - - [768, 512, 1, 128, 800, 800, 
160, 160] ++ - [39, 6782.33] ++ - - [768, 512, 1, 256, 800, 800, 288, 288] ++ - [2, 10931.0] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [15, 15464.0] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [4, 20500.6] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [18, 23934.6] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [28, 26178.2] ++ - - [768, 1024, 1, 64, 800, 800, 96, 96] ++ - [21, 6325.44] ++ - - [768, 1024, 1, 128, 800, 800, 160, 160] ++ - [22, 11844.1] ++ - - [768, 1024, 1, 256, 800, 800, 288, 288] ++ - [22, 17949.9] ++ - - [768, 1024, 1, 512, 800, 800, 544, 544] ++ - [24, 24447.6] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [6, 28847.5] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [43, 30748.0] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [51, 33788.1] ++ - - [768, 2048, 1, 64, 800, 800, 96, 96] ++ - [40, 11795.5] ++ - - [768, 2048, 1, 128, 800, 800, 160, 160] ++ - [16, 17724.0] ++ - - [768, 2048, 1, 256, 800, 800, 288, 288] ++ - [44, 24016.0] ++ - - [768, 2048, 1, 512, 800, 800, 544, 544] ++ - [43, 29851.6] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [43, 32529.0] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [51, 35945.9] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [51, 37562.2] ++ - - [768, 4096, 1, 64, 800, 800, 96, 96] ++ - [21, 15401.3] ++ - - [768, 4096, 1, 128, 800, 800, 160, 160] ++ - [25, 21978.9] ++ - - [768, 4096, 1, 256, 800, 800, 288, 288] ++ - [44, 28088.8] ++ - - [768, 4096, 1, 512, 800, 800, 544, 544] ++ - [24, 31423.5] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [43, 35456.1] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [51, 37685.0] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [51, 36951.0] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [3, 1000.39] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [0, 1845.0] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [6, 2970.46] ++ - - [1536, 64, 1, 512, 1568, 1568, 
544, 544] ++ - [43, 4435.67] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [4, 6060.39] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [24, 7662.28] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [24, 8411.21] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 96] ++ - [49, 2077.76] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [3, 3829.23] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [25, 6363.82] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [43, 9573.28] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [6, 12858.5] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [43, 15697.3] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [29, 17128.0] ++ - - [1536, 256, 1, 64, 1568, 1568, 96, 96] ++ - [40, 3840.94] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 160] ++ - [49, 6720.72] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [46, 10829.8] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [27, 15680.9] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [18, 20705.1] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [27, 24112.4] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [14, 25905.7] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 96] ++ - [21, 6168.85] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 160] ++ - [16, 11549.2] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 288] ++ - [33, 18740.3] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [24, 24995.5] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [43, 29613.4] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [43, 31179.0] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [51, 33259.6] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] ++ - [49, 11812.2] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] ++ - [44, 18093.5] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] ++ - [44, 24682.9] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] ++ - [7, 29987.2] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 
1056, 1056] ++ - [43, 32734.0] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [51, 35922.7] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [1, 37829.3] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] ++ - [40, 15348.5] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] ++ - [44, 22023.3] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] ++ - [44, 28212.8] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] ++ - [51, 31322.6] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] ++ - [51, 35312.3] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [43, 37702.9] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [24, 37668.8] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] ++ - [40, 17719.3] ++ - - [1536, 4096, 1, 128, 1568, 1568, 160, 160] ++ - [44, 24513.9] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] ++ - [52, 29207.9] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] ++ - [43, 33728.3] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] ++ - [51, 36909.3] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [51, 37902.8] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [24, 37593.2] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [11, 1867.46] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [36, 3002.71] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [9, 4961.21] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [53, 7396.27] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [53, 9896.1] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [27, 11820.8] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [28, 13089.7] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 96] ++ - [34, 3649.87] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [30, 6573.29] ++ - - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [9, 10606.2] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [9, 15256.6] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [27, 20513.2] ++ - - [3072, 128, 1, 2048, 3104, 
3104, 2080, 2080] ++ - [36, 24127.5] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [5, 26725.0] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 96] ++ - [35, 6018.37] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 160] ++ - [40, 11286.4] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [44, 17635.4] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [25, 22995.6] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [44, 27061.8] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [40, 29256.7] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [30, 30967.9] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 96] ++ - [21, 11781.8] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 160] ++ - [52, 18002.9] ++ - - [3072, 512, 1, 256, 3104, 3104, 288, 288] ++ - [25, 24743.6] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [25, 30190.7] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [25, 32462.9] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [10, 35969.2] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [14, 37823.8] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] ++ - [40, 14991.9] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] ++ - [52, 21712.2] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] ++ - [44, 27782.6] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] ++ - [25, 31159.7] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [44, 34853.0] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [14, 37719.5] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [14, 36563.5] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] ++ - [40, 17801.5] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] ++ - [52, 24795.4] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] ++ - [44, 29186.2] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] ++ - [25, 33497.5] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] ++ - [33, 36596.1] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [14, 37875.8] ++ - - [3072, 2048, 1, 
4096, 3104, 3104, 4128, 4128] ++ - [13, 36978.7] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] ++ - [40, 18774.3] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] ++ - [52, 25527.6] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] ++ - [52, 30859.6] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] ++ - [51, 34833.4] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] ++ - [51, 36900.4] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] ++ - [33, 37626.6] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [15, 36225.5] ++ - - [4096, 64, 1, 64, 4128, 4128, 96, 96] ++ - [26, 2086.46] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [46, 3837.85] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [27, 6367.65] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [18, 9467.95] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [36, 12858.6] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [53, 15583.1] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [18, 16485.6] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 96] ++ - [52, 5803.26] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [15, 10105.2] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [7, 15846.2] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [15, 22586.0] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [24, 28587.3] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [1, 33047.3] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [36, 33095.7] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 96] ++ - [40, 9209.4] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 160] ++ - [34, 14999.7] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [34, 21230.3] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [44, 27352.3] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [25, 31965.1] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [1, 34747.8] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [9, 34730.1] ++ - - [4096, 512, 1, 64, 
4128, 4128, 96, 96] ++ - [49, 12811.9] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 160] ++ - [52, 19768.4] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 288] ++ - [25, 26186.3] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [44, 31344.6] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [25, 33708.2] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [14, 36885.0] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [27, 35926.8] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] ++ - [40, 16272.8] ++ - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] ++ - [44, 22970.7] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] ++ - [32, 28494.8] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] ++ - [44, 32270.7] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [43, 35694.7] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [14, 38373.3] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [43, 35849.0] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] ++ - [40, 18733.1] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] ++ - [44, 24530.9] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] ++ - [44, 29917.6] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] ++ - [43, 34241.9] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] ++ - [51, 37115.0] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [24, 37885.1] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [27, 36307.3] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] ++ - [38, 17609.0] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] ++ - [28, 23294.6] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] ++ - [25, 30255.6] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] ++ - [43, 34915.7] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] ++ - [51, 37121.9] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] ++ - [33, 38048.4] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [46, 35769.5] ++- null ++- null ++- DeviceEfficiency +diff --git 
a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HHS_BH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HHS_BH_GB.yaml +new file mode 100644 +index 00000000..28a2691a +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_HHS_BH_GB.yaml +@@ -0,0 +1,15423 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 
++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 
++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 
++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false 
++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false 
++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false 
++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 
0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 
0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false 
++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false 
++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 
0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 
0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ 
LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ 
PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ 
StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 4 ++ GlobalLoadVectorWidthB: 4 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 4 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ 
NumLoadsPerpendicularB: 8 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false 
++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ 
DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 
16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ 
TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ 
AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 
++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ 
IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 25600 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 4608 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 4 ++ DestDataType: 4 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ 
UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 2 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 2 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [8, 36.7457] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [20, 68.6421] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [29, 111.931] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [39, 175.083] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [39, 247.116] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [2, 310.505] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [2, 340.267] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [40, 65.5114] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [21, 121.913] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [3, 220.312] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [39, 349.234] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [6, 491.856] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [24, 618.149] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [33, 679.307] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [21, 151.99] ++ - - [64, 256, 1, 128, 96, 96, 160, 160] ++ - [6, 279.582] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [6, 479.567] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [2, 745.52] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [39, 1027.01] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [39, 1260.26] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [43, 1410.03] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [12, 305.795] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [20, 562.013] ++ - - 
[64, 512, 1, 256, 96, 96, 288, 288] ++ - [39, 962.548] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [2, 1496.89] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [2, 2078.96] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [24, 2569.05] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [24, 2891.68] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [20, 622.3] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [48, 1147.71] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [20, 1956.06] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [31, 3043.76] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [51, 4295.79] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [41, 5258.89] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [11, 5969.6] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [20, 1227.66] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [39, 2253.48] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [37, 3857.27] ++ - - [64, 2048, 1, 512, 96, 96, 544, 544] ++ - [19, 6011.17] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [4, 8396.99] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [51, 10595.4] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [39, 12061.8] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [26, 2226.57] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [27, 3942.02] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [9, 6505.95] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [4, 9536.56] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [33, 12933.5] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [33, 15681.0] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [46, 16253.5] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [7, 73.833] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [20, 135.633] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [0, 233.509] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [20, 344.955] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [24, 486.267] ++ - - [128, 64, 1, 2048, 160, 160, 
2080, 2080] ++ - [6, 611.525] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [15, 666.807] ++ - - [128, 128, 1, 64, 160, 160, 96, 96] ++ - [21, 178.603] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [7, 331.723] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [2, 547.557] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [6, 824.35] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [43, 1096.84] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [51, 1317.2] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [43, 1409.91] ++ - - [128, 256, 1, 64, 160, 160, 96, 96] ++ - [47, 368.437] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [3, 662.189] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [39, 1096.26] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [43, 1651.95] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [24, 2200.28] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [6, 2639.9] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [6, 2887.08] ++ - - [128, 512, 1, 64, 160, 160, 96, 96] ++ - [7, 798.611] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [23, 1415.08] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [43, 2341.22] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [20, 3489.44] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [43, 4564.91] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [6, 5405.67] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [33, 5917.36] ++ - - [128, 1024, 1, 64, 160, 160, 96, 96] ++ - [52, 1583.05] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [7, 2834.47] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [29, 4702.13] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - [29, 6981.04] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [51, 9150.99] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [20, 10891.2] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [43, 12325.1] ++ - - [128, 2048, 1, 64, 160, 160, 96, 96] ++ - [44, 2882.68] 
++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [40, 5170.97] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [39, 8593.78] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [51, 12941.6] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [20, 17751.3] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [2, 21555.8] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [15, 24389.9] ++ - - [128, 4096, 1, 64, 160, 160, 96, 96] ++ - [9, 4599.63] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [45, 8503.38] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [18, 13859.7] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [29, 20533.5] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [15, 27450.2] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [6, 32243.5] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [46, 32669.2] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [21, 159.99] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [2, 296.543] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [20, 485.675] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [6, 717.832] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [43, 1018.84] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [24, 1259.41] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [6, 1335.98] ++ - - [256, 128, 1, 64, 288, 288, 96, 96] ++ - [52, 376.508] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [7, 693.847] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [20, 1141.15] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [6, 1696.21] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [6, 2236.21] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [6, 2657.77] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [43, 2900.5] ++ - - [256, 256, 1, 64, 288, 288, 96, 96] ++ - [16, 795.427] ++ - - [256, 256, 1, 128, 288, 288, 160, 160] ++ - [5, 1415.8] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [43, 2329.51] ++ - - [256, 256, 1, 512, 288, 
288, 544, 544] ++ - [2, 3449.25] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [33, 4417.96] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [6, 5296.04] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [6, 5854.77] ++ - - [256, 512, 1, 64, 288, 288, 96, 96] ++ - [21, 1523.81] ++ - - [256, 512, 1, 128, 288, 288, 160, 160] ++ - [25, 2709.06] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [39, 4698.83] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [24, 6919.13] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [51, 9062.03] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [24, 10725.4] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [6, 11877.7] ++ - - [256, 1024, 1, 64, 288, 288, 96, 96] ++ - [47, 2951.14] ++ - - [256, 1024, 1, 128, 288, 288, 160, 160] ++ - [25, 5266.74] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [20, 8731.31] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [43, 13153.4] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [24, 17491.0] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [15, 21173.3] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [15, 23730.1] ++ - - [256, 2048, 1, 64, 288, 288, 96, 96] ++ - [40, 4922.17] ++ - - [256, 2048, 1, 128, 288, 288, 160, 160] ++ - [42, 8645.79] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [24, 13915.8] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [39, 20629.8] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [0, 27165.4] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [6, 31676.6] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [24, 33904.5] ++ - - [256, 4096, 1, 64, 288, 288, 96, 96] ++ - [17, 8928.77] ++ - - [256, 4096, 1, 128, 288, 288, 160, 160] ++ - [25, 14966.3] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [7, 21681.2] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [6, 28146.7] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [6, 32918.6] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 
2080] ++ - [43, 34272.5] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [53, 34792.5] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [11, 255.667] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [2, 470.847] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [20, 765.847] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [48, 1123.98] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [48, 1548.09] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [15, 1901.1] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [33, 2091.7] ++ - - [384, 128, 1, 64, 416, 416, 96, 96] ++ - [49, 547.464] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [40, 1046.31] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [3, 1728.42] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [24, 2559.32] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [51, 3348.52] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [24, 4000.6] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [24, 4340.61] ++ - - [384, 256, 1, 64, 416, 416, 96, 96] ++ - [21, 1145.78] ++ - - [384, 256, 1, 128, 416, 416, 160, 160] ++ - [44, 2036.72] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [2, 3352.31] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [43, 5011.6] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [6, 6638.73] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [43, 7912.21] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [43, 8765.9] ++ - - [384, 512, 1, 64, 416, 416, 96, 96] ++ - [21, 2307.94] ++ - - [384, 512, 1, 128, 416, 416, 160, 160] ++ - [7, 4084.68] ++ - - [384, 512, 1, 256, 416, 416, 288, 288] ++ - [0, 6471.84] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [6, 9722.14] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [24, 12959.5] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [24, 15626.1] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [43, 17407.9] ++ - - [384, 1024, 1, 64, 416, 416, 96, 96] ++ - [40, 3699.75] ++ - - [384, 
1024, 1, 128, 416, 416, 160, 160] ++ - [53, 6651.44] ++ - - [384, 1024, 1, 256, 416, 416, 288, 288] ++ - [4, 10926.2] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [6, 15931.5] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [6, 20872.6] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [9, 24281.0] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [43, 25732.7] ++ - - [384, 2048, 1, 64, 416, 416, 96, 96] ++ - [49, 6064.77] ++ - - [384, 2048, 1, 128, 416, 416, 160, 160] ++ - [33, 11356.4] ++ - - [384, 2048, 1, 256, 416, 416, 288, 288] ++ - [37, 18214.6] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [51, 24514.6] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [43, 29736.9] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [6, 31446.8] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [51, 33687.0] ++ - - [384, 4096, 1, 64, 416, 416, 96, 96] ++ - [52, 10806.6] ++ - - [384, 4096, 1, 128, 416, 416, 160, 160] ++ - [44, 16615.2] ++ - - [384, 4096, 1, 256, 416, 416, 288, 288] ++ - [44, 22539.9] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [44, 27574.2] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [25, 29728.7] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [50, 32596.5] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [51, 32614.7] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [30, 499.877] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [12, 923.313] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [13, 1497.78] ++ - - [768, 64, 1, 512, 800, 800, 544, 544] ++ - [24, 2229.43] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [6, 3067.31] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [43, 3766.63] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [19, 4000.92] ++ - - [768, 128, 1, 64, 800, 800, 96, 96] ++ - [38, 1097.98] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [38, 2014.55] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [6, 3333.22] ++ - - [768, 128, 1, 512, 800, 
800, 544, 544] ++ - [33, 5123.86] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [43, 6740.98] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [6, 7982.49] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [6, 8686.1] ++ - - [768, 256, 1, 64, 800, 800, 96, 96] ++ - [49, 2159.42] ++ - - [768, 256, 1, 128, 800, 800, 160, 160] ++ - [7, 3877.02] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [43, 6413.31] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [2, 9668.01] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [43, 12891.5] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [22, 15527.9] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [24, 17518.4] ++ - - [768, 512, 1, 64, 800, 800, 96, 96] ++ - [21, 3868.69] ++ - - [768, 512, 1, 128, 800, 800, 160, 160] ++ - [39, 6782.33] ++ - - [768, 512, 1, 256, 800, 800, 288, 288] ++ - [2, 10931.0] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [15, 15464.0] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [4, 20500.6] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [18, 23934.6] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [28, 26178.2] ++ - - [768, 1024, 1, 64, 800, 800, 96, 96] ++ - [21, 6325.44] ++ - - [768, 1024, 1, 128, 800, 800, 160, 160] ++ - [22, 11844.1] ++ - - [768, 1024, 1, 256, 800, 800, 288, 288] ++ - [22, 17949.9] ++ - - [768, 1024, 1, 512, 800, 800, 544, 544] ++ - [24, 24447.6] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [6, 28847.5] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [43, 30748.0] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [51, 33788.1] ++ - - [768, 2048, 1, 64, 800, 800, 96, 96] ++ - [40, 11795.5] ++ - - [768, 2048, 1, 128, 800, 800, 160, 160] ++ - [16, 17724.0] ++ - - [768, 2048, 1, 256, 800, 800, 288, 288] ++ - [44, 24016.0] ++ - - [768, 2048, 1, 512, 800, 800, 544, 544] ++ - [43, 29851.6] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [43, 32529.0] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] 
++ - [51, 35945.9] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [51, 37562.2] ++ - - [768, 4096, 1, 64, 800, 800, 96, 96] ++ - [21, 15401.3] ++ - - [768, 4096, 1, 128, 800, 800, 160, 160] ++ - [25, 21978.9] ++ - - [768, 4096, 1, 256, 800, 800, 288, 288] ++ - [44, 28088.8] ++ - - [768, 4096, 1, 512, 800, 800, 544, 544] ++ - [24, 31423.5] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [43, 35456.1] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [51, 37685.0] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [51, 36951.0] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [3, 1000.39] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [0, 1845.0] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [6, 2970.46] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [43, 4435.67] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [4, 6060.39] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [24, 7662.28] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [24, 8411.21] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 96] ++ - [49, 2077.76] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [3, 3829.23] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [25, 6363.82] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [43, 9573.28] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [6, 12858.5] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [43, 15697.3] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [29, 17128.0] ++ - - [1536, 256, 1, 64, 1568, 1568, 96, 96] ++ - [40, 3840.94] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 160] ++ - [49, 6720.72] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [46, 10829.8] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [27, 15680.9] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [18, 20705.1] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [27, 24112.4] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [14, 25905.7] ++ - - 
[1536, 512, 1, 64, 1568, 1568, 96, 96] ++ - [21, 6168.85] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 160] ++ - [16, 11549.2] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 288] ++ - [33, 18740.3] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [24, 24995.5] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [43, 29613.4] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [43, 31179.0] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [51, 33259.6] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] ++ - [49, 11812.2] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] ++ - [44, 18093.5] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] ++ - [44, 24682.9] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] ++ - [7, 29987.2] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [43, 32734.0] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [51, 35922.7] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [1, 37829.3] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] ++ - [40, 15348.5] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] ++ - [44, 22023.3] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] ++ - [44, 28212.8] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] ++ - [51, 31322.6] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] ++ - [51, 35312.3] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [43, 37702.9] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [24, 37668.8] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] ++ - [40, 17719.3] ++ - - [1536, 4096, 1, 128, 1568, 1568, 160, 160] ++ - [44, 24513.9] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] ++ - [52, 29207.9] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] ++ - [43, 33728.3] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] ++ - [51, 36909.3] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [51, 37902.8] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [24, 37593.2] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [11, 
1867.46] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [36, 3002.71] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [9, 4961.21] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [53, 7396.27] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [53, 9896.1] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [27, 11820.8] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [28, 13089.7] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 96] ++ - [34, 3649.87] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [30, 6573.29] ++ - - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [9, 10606.2] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [9, 15256.6] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [27, 20513.2] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [36, 24127.5] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [5, 26725.0] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 96] ++ - [35, 6018.37] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 160] ++ - [40, 11286.4] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [44, 17635.4] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [25, 22995.6] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [44, 27061.8] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [40, 29256.7] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [30, 30967.9] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 96] ++ - [21, 11781.8] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 160] ++ - [52, 18002.9] ++ - - [3072, 512, 1, 256, 3104, 3104, 288, 288] ++ - [25, 24743.6] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [25, 30190.7] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [25, 32462.9] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [10, 35969.2] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [14, 37823.8] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] ++ - [40, 14991.9] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] ++ - [52, 21712.2] ++ 
- - [3072, 1024, 1, 256, 3104, 3104, 288, 288] ++ - [44, 27782.6] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] ++ - [25, 31159.7] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [44, 34853.0] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [14, 37719.5] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [14, 36563.5] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] ++ - [40, 17801.5] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] ++ - [52, 24795.4] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] ++ - [44, 29186.2] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] ++ - [25, 33497.5] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] ++ - [33, 36596.1] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [14, 37875.8] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [13, 36978.7] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] ++ - [40, 18774.3] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] ++ - [52, 25527.6] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] ++ - [52, 30859.6] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] ++ - [51, 34833.4] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] ++ - [51, 36900.4] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] ++ - [33, 37626.6] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [15, 36225.5] ++ - - [4096, 64, 1, 64, 4128, 4128, 96, 96] ++ - [26, 2086.46] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [46, 3837.85] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [27, 6367.65] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [18, 9467.95] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [36, 12858.6] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [53, 15583.1] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [18, 16485.6] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 96] ++ - [52, 5803.26] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [15, 10105.2] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [7, 
15846.2] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [15, 22586.0] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [24, 28587.3] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [1, 33047.3] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [36, 33095.7] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 96] ++ - [40, 9209.4] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 160] ++ - [34, 14999.7] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [34, 21230.3] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [44, 27352.3] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [25, 31965.1] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [1, 34747.8] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [9, 34730.1] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 96] ++ - [49, 12811.9] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 160] ++ - [52, 19768.4] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 288] ++ - [25, 26186.3] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [44, 31344.6] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [25, 33708.2] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [14, 36885.0] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [27, 35926.8] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] ++ - [40, 16272.8] ++ - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] ++ - [44, 22970.7] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] ++ - [32, 28494.8] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] ++ - [44, 32270.7] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [43, 35694.7] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [14, 38373.3] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [43, 35849.0] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] ++ - [40, 18733.1] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] ++ - [44, 24530.9] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] ++ - [44, 29917.6] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] ++ - 
[43, 34241.9] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] ++ - [51, 37115.0] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [24, 37885.1] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [27, 36307.3] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] ++ - [38, 17609.0] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] ++ - [28, 23294.6] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] ++ - [25, 30255.6] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] ++ - [43, 34915.7] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] ++ - [51, 37121.9] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] ++ - [33, 38048.4] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [46, 35769.5] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_I8II_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_I8II_BH.yaml +new file mode 100644 +index 00000000..076ef2ea +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_I8II_BH.yaml +@@ -0,0 +1,22173 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: 
false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ 
InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25856 ++ LdsNumElementsAlignedA: 4352 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4352 ++ LdsOffsetB_Blk: 20736 ++ LdsPadA: 8 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ 
PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ 
SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ 
GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ 
NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 
++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ 
CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 
1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ 
NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25856 ++ LdsNumElementsAlignedA: 4352 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4352 ++ 
LdsOffsetB_Blk: 20736 ++ LdsPadA: 8 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: 
false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ 
VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ 
GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ 
FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: 
[16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: 
[] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ 
AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ 
LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ 
AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ 
Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ 
StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr 
++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ 
MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: 
false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ 
LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ 
IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 
++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ 
UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ 
GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ 
NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 
++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ 
DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 
++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ 
TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ 
AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: 
false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: 
[3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false 
++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ 
ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ 
GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 
4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: 
Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ 
AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ 
LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ 
Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ 
CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 
1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ 
NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ 
LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ 
ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ 
Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: 
false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ 
FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: 
[16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: 
[] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ 
AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ 
LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ 
AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ 
Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false 
++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: 
ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 
16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ 
TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ 
AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ 
LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ 
IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 
++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ 
UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ 
GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ 
NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 
++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ 
DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ 
MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: 
false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ 
AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ 
_VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 
++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ 
SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false 
++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ 
AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: 
-1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 
0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: 
[4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: 
Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ 
AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ 
LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ 
Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25856 ++ LdsNumElementsAlignedA: 4352 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4352 ++ LdsOffsetB_Blk: 20736 ++ LdsPadA: 8 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA8_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ 
GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ 
NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 
1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ 
CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ 
MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ 
NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ 
AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ 
LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ 
ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ 
Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: 
false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ 
PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ 
StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ 
FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ 
MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false 
++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ 
AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: 
[16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: 
[] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ 
AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ 
LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ 
AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: 
false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ 
GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ 
OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false 
++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: 
ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ 
MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true 
++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 
1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ 
LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ 
IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ 
_staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ 
LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ 
PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ 
UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ 
GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 
++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ 
StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ 
DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ 
MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 
1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} 
++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false 
++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 
1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ 
_UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ 
LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false 
++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 
4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ 
GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ 
NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ 
SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false 
++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ 
AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ 
LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] 
++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] 
++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: 
Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 
32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} 
++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ 
LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 
++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 
32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: 
false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 
32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: 
-1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ 
PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ 
StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 10240 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_LPB32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false 
++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ 
MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ 
NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ 
AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ 
LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ 
ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 
0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ 
GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 
++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ 
StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: 
true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 
16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ 
UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: 
true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA 
++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: 
false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 
++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ 
ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25856 ++ LdsNumElementsAlignedA: 4352 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4352 ++ LdsOffsetB_Blk: 20736 ++ LdsPadA: 8 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ 
NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA8_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ 
StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 
++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [2, 38.5506] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [30, 68.5794] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [5, 118.685] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [55, 191.102] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [24, 266.865] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [28, 339.675] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 
4128] ++ - [14, 395.96] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [21, 68.7323] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [55, 130.794] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [21, 228.722] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [55, 380.85] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [37, 535.74] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [30, 685.317] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [37, 796.167] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [68, 161.892] ++ - - [64, 256, 1, 128, 96, 96, 160, 160] ++ - [7, 302.054] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [66, 523.176] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [30, 818.804] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [4, 1112.33] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [78, 1401.66] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [63, 1624.99] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [68, 345.665] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [29, 642.313] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [66, 1102.17] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [20, 1707.43] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [24, 2277.35] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [48, 2887.53] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [36, 3258.11] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [68, 672.056] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [20, 1313.18] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [27, 2234.88] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [68, 3440.08] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [4, 4682.47] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [13, 5773.56] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [13, 6533.51] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [27, 1302.98] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [54, 2319.22] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [43, 3986.52] ++ - - [64, 2048, 1, 512, 
96, 96, 544, 544] ++ - [27, 6463.34] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [6, 8828.95] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [40, 11193.2] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [40, 13014.1] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [13, 2082.83] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [62, 3882.72] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [6, 6471.46] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [76, 10176.5] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [23, 14302.8] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [35, 17831.5] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [13, 19352.3] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [7, 68.5075] ++ - - [128, 64, 1, 128, 160, 160, 160, 160] ++ - [55, 135.143] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [7, 237.503] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [30, 368.407] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [78, 530.59] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [14, 686.833] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [13, 797.246] ++ - - [128, 128, 1, 64, 160, 160, 96, 96] ++ - [7, 199.578] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [69, 364.722] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [55, 620.643] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [5, 938.956] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [25, 1223.72] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [63, 1501.72] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [49, 1674.46] ++ - - [128, 256, 1, 64, 160, 160, 96, 96] ++ - [41, 393.316] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [55, 772.289] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [30, 1279.92] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [30, 1908.45] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [63, 2447.09] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [37, 3029.07] ++ - - [128, 
256, 1, 4096, 160, 160, 4128, 4128] ++ - [36, 3350.0] ++ - - [128, 512, 1, 64, 160, 160, 96, 96] ++ - [69, 843.246] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [30, 1536.95] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [4, 2548.96] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [4, 3685.68] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [77, 4908.13] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [25, 6048.58] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [24, 6708.37] ++ - - [128, 1024, 1, 64, 160, 160, 96, 96] ++ - [30, 1525.77] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [19, 2786.45] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [76, 4559.65] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - [21, 7148.4] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [24, 9529.14] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [76, 11622.1] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [14, 13149.3] ++ - - [128, 2048, 1, 64, 160, 160, 96, 96] ++ - [70, 2594.68] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [66, 4902.79] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [18, 8348.95] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [5, 12993.0] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [67, 17579.3] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [40, 22628.9] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [40, 26191.4] ++ - - [128, 4096, 1, 64, 160, 160, 96, 96] ++ - [62, 5000.66] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [1, 8895.69] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [24, 14269.4] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [76, 22345.5] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [35, 31809.0] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [36, 37614.4] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [24, 39328.4] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [7, 163.279] ++ - - [256, 64, 1, 128, 
288, 288, 160, 160] ++ - [6, 288.072] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [54, 520.256] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [6, 817.286] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [43, 1104.86] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [25, 1420.35] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [25, 1625.35] ++ - - [256, 128, 1, 64, 288, 288, 96, 96] ++ - [66, 414.293] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [4, 762.462] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [76, 1241.1] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [6, 1902.18] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [13, 2502.76] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [36, 2975.08] ++ - - [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [13, 3369.51] ++ - - [256, 256, 1, 64, 288, 288, 96, 96] ++ - [29, 835.19] ++ - - [256, 256, 1, 128, 288, 288, 160, 160] ++ - [68, 1447.32] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [29, 2425.5] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [68, 3803.49] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [29, 4981.37] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [52, 6034.98] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [48, 6712.74] ++ - - [256, 512, 1, 64, 288, 288, 96, 96] ++ - [54, 1519.68] ++ - - [256, 512, 1, 128, 288, 288, 160, 160] ++ - [29, 2781.83] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [40, 4657.1] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [29, 7127.13] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [28, 9430.05] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [29, 11697.6] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [37, 13136.7] ++ - - [256, 1024, 1, 64, 288, 288, 96, 96] ++ - [54, 2661.78] ++ - - [256, 1024, 1, 128, 288, 288, 160, 160] ++ - [43, 4934.48] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [1, 8247.37] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [5, 
12712.4] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [27, 18102.1] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [76, 22394.9] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [68, 25998.0] ++ - - [256, 2048, 1, 64, 288, 288, 96, 96] ++ - [7, 4817.58] ++ - - [256, 2048, 1, 128, 288, 288, 160, 160] ++ - [15, 8664.8] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [76, 14918.1] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [13, 22453.8] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [75, 31701.9] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [13, 37712.2] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [36, 39075.8] ++ - - [256, 4096, 1, 64, 288, 288, 96, 96] ++ - [40, 9556.98] ++ - - [256, 4096, 1, 128, 288, 288, 160, 160] ++ - [15, 16241.3] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [11, 23107.2] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [31, 31195.3] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [63, 36860.4] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [76, 39337.8] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [48, 42022.3] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [6, 251.256] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [6, 446.838] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [68, 796.894] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [20, 1252.28] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [5, 1676.72] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [24, 2141.59] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [25, 2446.61] ++ - - [384, 128, 1, 64, 416, 416, 96, 96] ++ - [53, 630.28] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [21, 1154.18] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [5, 1919.88] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [69, 2759.72] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [63, 3711.23] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [14, 4546.27] ++ - - 
[384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [14, 5029.4] ++ - - [384, 256, 1, 64, 416, 416, 96, 96] ++ - [30, 1199.51] ++ - - [384, 256, 1, 128, 416, 416, 160, 160] ++ - [7, 2184.91] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [55, 3522.66] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [76, 5420.18] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [25, 7347.16] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [67, 8923.26] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [63, 9954.85] ++ - - [384, 512, 1, 64, 416, 416, 96, 96] ++ - [68, 2011.66] ++ - - [384, 512, 1, 128, 416, 416, 160, 160] ++ - [55, 3736.57] ++ - - [384, 512, 1, 256, 416, 416, 288, 288] ++ - [69, 6603.47] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [28, 10171.1] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [19, 13639.1] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [37, 17136.4] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [37, 19473.0] ++ - - [384, 1024, 1, 64, 416, 416, 96, 96] ++ - [42, 3976.92] ++ - - [384, 1024, 1, 128, 416, 416, 160, 160] ++ - [61, 6604.34] ++ - - [384, 1024, 1, 256, 416, 416, 288, 288] ++ - [78, 11686.1] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [63, 18004.6] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [37, 23721.8] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [37, 28165.5] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [63, 29953.8] ++ - - [384, 2048, 1, 64, 416, 416, 96, 96] ++ - [67, 7821.55] ++ - - [384, 2048, 1, 128, 416, 416, 160, 160] ++ - [64, 13411.1] ++ - - [384, 2048, 1, 256, 416, 416, 288, 288] ++ - [72, 20886.7] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [12, 28102.5] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [60, 35289.5] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [35, 37949.5] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [47, 41000.3] ++ - - [384, 4096, 1, 64, 416, 416, 96, 96] ++ - [50, 11739.2] ++ - - [384, 4096, 
1, 128, 416, 416, 160, 160] ++ - [64, 19569.1] ++ - - [384, 4096, 1, 256, 416, 416, 288, 288] ++ - [39, 27020.1] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [54, 33590.9] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [30, 36268.6] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [7, 39724.1] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [7, 41645.4] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [20, 511.584] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [6, 964.947] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [68, 1653.91] ++ - - [768, 64, 1, 512, 800, 800, 544, 544] ++ - [5, 2522.39] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [76, 3431.16] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [21, 4281.19] ++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [76, 4884.68] ++ - - [768, 128, 1, 64, 800, 800, 96, 96] ++ - [52, 1167.9] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [29, 2157.94] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [76, 3510.37] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [20, 5463.11] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [40, 7291.27] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [78, 8823.54] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [62, 9937.89] ++ - - [768, 256, 1, 64, 800, 800, 96, 96] ++ - [54, 2087.41] ++ - - [768, 256, 1, 128, 800, 800, 160, 160] ++ - [27, 3835.68] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [66, 6526.41] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [52, 10080.4] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [66, 13536.4] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [28, 17018.3] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [36, 19391.0] ++ - - [768, 512, 1, 64, 800, 800, 96, 96] ++ - [0, 3637.21] ++ - - [768, 512, 1, 128, 800, 800, 160, 160] ++ - [75, 7153.45] ++ - - [768, 512, 1, 256, 800, 800, 288, 288] ++ - [62, 11398.9] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - 
[24, 17105.1] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [36, 23418.3] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [37, 27902.0] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [62, 29851.1] ++ - - [768, 1024, 1, 64, 800, 800, 96, 96] ++ - [52, 6690.37] ++ - - [768, 1024, 1, 128, 800, 800, 160, 160] ++ - [64, 13391.4] ++ - - [768, 1024, 1, 256, 800, 800, 288, 288] ++ - [71, 20878.0] ++ - - [768, 1024, 1, 512, 800, 800, 544, 544] ++ - [10, 27938.8] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [37, 35078.9] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [36, 37988.9] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [62, 40917.0] ++ - - [768, 2048, 1, 64, 800, 800, 96, 96] ++ - [73, 12139.8] ++ - - [768, 2048, 1, 128, 800, 800, 160, 160] ++ - [51, 20062.4] ++ - - [768, 2048, 1, 256, 800, 800, 288, 288] ++ - [33, 27213.7] ++ - - [768, 2048, 1, 512, 800, 800, 544, 544] ++ - [54, 33451.3] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [35, 36754.4] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [60, 40713.7] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [12, 43086.7] ++ - - [768, 4096, 1, 64, 800, 800, 96, 96] ++ - [73, 17071.7] ++ - - [768, 4096, 1, 128, 800, 800, 160, 160] ++ - [26, 25695.8] ++ - - [768, 4096, 1, 256, 800, 800, 288, 288] ++ - [58, 32861.6] ++ - - [768, 4096, 1, 512, 800, 800, 544, 544] ++ - [32, 36276.7] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [9, 40014.3] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [62, 42772.1] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [75, 43035.5] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [3, 893.549] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [68, 1823.35] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [66, 3156.78] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [4, 4894.17] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [66, 6763.21] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 
2080] ++ - [13, 8453.07] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [13, 9663.84] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 96] ++ - [68, 2033.77] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [4, 3649.35] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [66, 6246.97] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [52, 10036.2] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [4, 13651.1] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [48, 16991.8] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [52, 19534.0] ++ - - [1536, 256, 1, 64, 1568, 1568, 96, 96] ++ - [76, 4064.25] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 160] ++ - [48, 6538.3] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [48, 11985.2] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [24, 17247.2] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [48, 23529.1] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [62, 28109.4] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [36, 29912.6] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 96] ++ - [27, 8165.42] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 160] ++ - [65, 13421.8] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 288] ++ - [71, 20221.7] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [36, 28599.6] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [62, 35306.6] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [60, 38042.7] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [36, 40905.5] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] ++ - [45, 11680.6] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] ++ - [64, 19427.4] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] ++ - [73, 27391.4] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] ++ - [74, 33371.0] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [60, 36429.4] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [35, 40697.3] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 
4128, 4128] ++ - [12, 43090.2] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] ++ - [45, 16626.2] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] ++ - [51, 25877.5] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] ++ - [74, 32844.2] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] ++ - [57, 36321.7] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] ++ - [44, 40125.9] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [62, 42773.2] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [75, 43092.5] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] ++ - [73, 21607.4] ++ - - [1536, 4096, 1, 128, 1568, 1568, 160, 160] ++ - [45, 30644.5] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] ++ - [34, 34998.2] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] ++ - [32, 39209.6] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] ++ - [36, 42262.3] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [9, 42637.6] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [29, 43208.6] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [6, 1803.23] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [62, 2871.5] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [36, 5087.1] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [13, 7863.11] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [62, 10897.3] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [62, 13628.0] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [36, 14815.7] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 96] ++ - [38, 4018.17] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [36, 6185.55] ++ - - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [14, 10363.8] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [48, 17710.0] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [62, 23113.1] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [25, 27916.5] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [13, 29818.4] ++ - - [3072, 256, 1, 64, 3104, 3104, 
96, 96] ++ - [34, 7449.92] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 160] ++ - [64, 13829.3] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [31, 20772.5] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [37, 27921.3] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [13, 34935.9] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [75, 37956.7] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [62, 40886.8] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 96] ++ - [73, 11637.4] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 160] ++ - [50, 19974.9] ++ - - [3072, 512, 1, 256, 3104, 3104, 288, 288] ++ - [34, 27012.8] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [34, 33600.7] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ - [35, 36905.2] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [61, 40723.0] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [47, 42817.9] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] ++ - [45, 16557.8] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] ++ - [45, 25375.2] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] ++ - [34, 33008.5] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] ++ - [32, 36453.3] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [8, 40048.1] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [36, 42814.4] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [75, 43281.0] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] ++ - [73, 22138.4] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] ++ - [73, 30708.8] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] ++ - [46, 35121.0] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] ++ - [44, 39435.7] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] ++ - [36, 42320.8] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [69, 42714.1] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [27, 42866.9] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] ++ - [19, 6510.9] ++ - - [3072, 4096, 1, 128, 
3104, 3104, 160, 160] ++ - [16, 13148.5] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] ++ - [22, 24930.8] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] ++ - [11, 39133.0] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] ++ - [33, 41732.5] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] ++ - [34, 42575.9] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [29, 43103.9] ++ - - [4096, 64, 1, 64, 4128, 4128, 96, 96] ++ - [24, 1910.63] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [76, 3657.97] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [13, 6304.29] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [24, 9697.13] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [24, 13918.0] ++ - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] ++ - [36, 17590.2] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [62, 19365.6] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 96] ++ - [4, 5850.82] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [2, 10205.2] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [36, 16625.5] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [13, 24310.4] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [62, 31786.4] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [13, 37607.9] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [36, 39160.5] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 96] ++ - [54, 9831.36] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 160] ++ - [17, 16168.9] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [33, 23303.7] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [56, 31410.7] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [60, 36991.2] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [37, 39511.5] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [76, 41894.4] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 96] ++ - [73, 13413.7] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 160] ++ - [64, 22455.7] ++ - - [4096, 512, 1, 256, 4128, 
4128, 288, 288] ++ - [10, 29743.5] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [32, 36082.6] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [71, 38464.0] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [62, 41781.9] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [76, 43408.5] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] ++ - [73, 18740.3] ++ - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] ++ - [73, 27851.8] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] ++ - [74, 34575.6] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] ++ - [44, 37737.4] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [36, 41071.9] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [36, 43425.9] ++ - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] ++ - [63, 43288.9] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] ++ - [58, 22742.0] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] ++ - [33, 31786.3] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] ++ - [58, 36015.3] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] ++ - [59, 39381.7] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] ++ - [33, 41619.2] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [27, 41878.8] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [32, 43314.6] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] ++ - [19, 6780.39] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] ++ - [16, 13826.8] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] ++ - [11, 26586.0] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] ++ - [11, 39749.7] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] ++ - [33, 41006.2] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] ++ - [31, 42968.4] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [63, 43313.6] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_I8II_BH_GB.yaml 
b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_I8II_BH_GB.yaml +new file mode 100644 +index 00000000..7a5d42f9 +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_I8II_BH_GB.yaml +@@ -0,0 +1,22173 @@ ++- {MinimumRequiredVersion: 4.35.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25856 ++ LdsNumElementsAlignedA: 4352 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4352 ++ LdsOffsetB_Blk: 20736 ++ LdsPadA: 8 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ 
- 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ 
LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: 
++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: 
true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ 
NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ 
StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 
++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25856 ++ LdsNumElementsAlignedA: 4352 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4352 ++ LdsOffsetB_Blk: 20736 ++ LdsPadA: 8 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} 
++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ 
LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ 
IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ 
ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 
++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ 
StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ 
AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ 
LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] 
++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] 
++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 10 ++ SolutionNameMin: 
Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 
32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: 
{} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true 
++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 
++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false 
++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ 
LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: 
-1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ 
PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ 
StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false 
++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ 
MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ 
NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ 
AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 
0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: 
true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ 
UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false 
++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 
0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ 
StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: 
true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 
16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ 
UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: 
true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA 
++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: 
false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 
++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ 
ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ 
NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM1 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 
++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 1 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 
0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 
1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ 
LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ 
IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ 
_WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ 
LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ 
ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 
++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ 
StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ 
AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ 
LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] 
++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] 
++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: 
Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 
32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: 
{} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true 
++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 
++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false 
++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ 
LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: 
-1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: 
[I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ 
StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ 
GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: 
false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: 
false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: 
true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25856 ++ LdsNumElementsAlignedA: 4352 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4352 ++ LdsOffsetB_Blk: 20736 ++ LdsPadA: 8 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ 
MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ 
NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA8_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ 
AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 
++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: 
false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ 
UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 
++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ 
PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ 
StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ 
Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ 
MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ 
UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ 
AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ 
MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: 
false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ 
LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: 
++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ 
NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 
++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 
0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 
++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ 
TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 4 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ 
AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ 
LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 
5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 
++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 
128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ 
PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ 
ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ 
GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ 
NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ 
StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ 
DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ 
MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ 
Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ 
AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: 
-1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 
0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ 
_GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ 
LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ 
PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 
64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ 
GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ 
NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: 
Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 
32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 
128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ 
SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ 
AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true 
++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ 
HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ 
WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ 
KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ 
PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 
++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ 
GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ 
NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ 
ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ 
CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ 
MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ 
NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ 
AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 
32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ 
ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 
++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ 
GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ 
PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 0 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ 
StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 0 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ 
GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ 
NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ 
ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false 
++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 
1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: 
[] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ 
AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ 
LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ 
ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 
0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ 
GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ 
PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: 
false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ 
ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] 
++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ 
UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: 
true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: 
FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ 
IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: 
false ++ - 1LDSBuffer: 1 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 1 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 2 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 10240 ++ 
LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 1 ++ LoopTail: true ++ LoopUnroll: 16 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: 
++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 2 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_LPB32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ 
UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ 
GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ 
NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 
++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 
0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ 
MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 
++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ 
AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false 
++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: 
[4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ 
_VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ 
LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 0 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ 
ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 8 ++ GlobalLoadVectorWidthB: 8 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ 
GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 8 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 4 ++ LVCB: 4 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 
4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: 
Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ 
DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ 
MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 64 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] 
++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 2 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ 
AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 25856 ++ LdsNumElementsAlignedA: 4352 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4352 ++ LdsOffsetB_Blk: 20736 ++ LdsPadA: 8 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true 
++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ 
Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA8_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ 
WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++ - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 3 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 32 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: true ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 16 ++ GlobalLoadVectorWidthB: 16 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 16 ++ GlobalSplitU: 1 ++ GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ 
InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 2 ++ LVCB: 2 ++ LVPA: 1 ++ LVPB: 1 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 128 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsInitCVgprs: false ++ LdsNumElements: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsPadA: 32 ++ LdsPadB: 32 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 16 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 2 ++ LoopTail: true ++ LoopUnroll: 32 ++ MACInstruction: FMA ++ MFMA_BF16_1K: false ++ MIArchVgpr: true ++ MIBlock: [16, 16, 16, 1, 1, 1] ++ MIInputPerThread: 16 ++ MIOutputVectorWidth: 1 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 16 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 16, 1] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 128 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ 
PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: 1 ++ PrefetchLocalRead: 1 ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 6 ++ ConvolutionConfig: [] ++ DataType: 8 ++ DestDataType: 6 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: true ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: false ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 ++ SourceSwap: 1 ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 4 ++ SubGroup1: 32 ++ 
SubGroupA: 4 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadSeparateGlobalReadA: 0 ++ ThreadSeparateGlobalReadB: 0 ++ ThreadTile: [4, 64] ++ ThreadTile0: 32 ++ ThreadTile1: 4 ++ ThreadTileA: 32 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 4 ++ WaveSeparateGlobalReadA: 1 ++ WaveSeparateGlobalReadB: 1 ++ WavefrontSize: 32 ++ WorkGroup: [32, 4, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 32 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWBforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [64, 64, 1, 64, 96, 96, 96, 96] ++ - [2, 38.5506] ++ - - [64, 64, 1, 128, 96, 96, 160, 160] ++ - [30, 68.5794] ++ - - [64, 64, 1, 256, 96, 96, 288, 288] ++ - [5, 118.685] ++ - - [64, 64, 1, 512, 96, 96, 544, 544] ++ - [55, 191.102] ++ - - [64, 64, 1, 1024, 96, 96, 1056, 1056] ++ - [24, 266.865] ++ - - [64, 64, 1, 2048, 96, 96, 2080, 2080] ++ - [28, 339.675] ++ - - [64, 64, 1, 4096, 96, 96, 4128, 4128] ++ - [14, 395.96] ++ - - [64, 128, 1, 64, 96, 96, 96, 96] ++ - [21, 68.7323] ++ - - [64, 128, 1, 128, 96, 96, 160, 160] ++ - [55, 130.794] ++ - - [64, 128, 1, 256, 96, 96, 288, 288] ++ - [21, 228.722] ++ - - [64, 128, 1, 512, 96, 96, 544, 544] ++ - [55, 380.85] ++ - - [64, 128, 1, 1024, 96, 96, 1056, 1056] ++ - [37, 535.74] ++ - - [64, 128, 1, 2048, 96, 96, 2080, 2080] ++ - [30, 685.317] ++ - - [64, 128, 1, 4096, 96, 96, 4128, 4128] ++ - [37, 796.167] ++ - - [64, 256, 1, 64, 96, 96, 96, 96] ++ - [68, 161.892] ++ - - [64, 256, 1, 128, 96, 96, 160, 160] ++ - [7, 302.054] ++ - - [64, 256, 1, 256, 96, 96, 288, 288] ++ - [66, 523.176] ++ - - [64, 256, 1, 512, 96, 96, 544, 544] ++ - [30, 818.804] ++ - - [64, 256, 1, 1024, 96, 96, 1056, 1056] ++ - [4, 
1112.33] ++ - - [64, 256, 1, 2048, 96, 96, 2080, 2080] ++ - [78, 1401.66] ++ - - [64, 256, 1, 4096, 96, 96, 4128, 4128] ++ - [63, 1624.99] ++ - - [64, 512, 1, 64, 96, 96, 96, 96] ++ - [68, 345.665] ++ - - [64, 512, 1, 128, 96, 96, 160, 160] ++ - [29, 642.313] ++ - - [64, 512, 1, 256, 96, 96, 288, 288] ++ - [66, 1102.17] ++ - - [64, 512, 1, 512, 96, 96, 544, 544] ++ - [20, 1707.43] ++ - - [64, 512, 1, 1024, 96, 96, 1056, 1056] ++ - [24, 2277.35] ++ - - [64, 512, 1, 2048, 96, 96, 2080, 2080] ++ - [48, 2887.53] ++ - - [64, 512, 1, 4096, 96, 96, 4128, 4128] ++ - [36, 3258.11] ++ - - [64, 1024, 1, 64, 96, 96, 96, 96] ++ - [68, 672.056] ++ - - [64, 1024, 1, 128, 96, 96, 160, 160] ++ - [20, 1313.18] ++ - - [64, 1024, 1, 256, 96, 96, 288, 288] ++ - [27, 2234.88] ++ - - [64, 1024, 1, 512, 96, 96, 544, 544] ++ - [68, 3440.08] ++ - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] ++ - [4, 4682.47] ++ - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] ++ - [13, 5773.56] ++ - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] ++ - [13, 6533.51] ++ - - [64, 2048, 1, 64, 96, 96, 96, 96] ++ - [27, 1302.98] ++ - - [64, 2048, 1, 128, 96, 96, 160, 160] ++ - [54, 2319.22] ++ - - [64, 2048, 1, 256, 96, 96, 288, 288] ++ - [43, 3986.52] ++ - - [64, 2048, 1, 512, 96, 96, 544, 544] ++ - [27, 6463.34] ++ - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] ++ - [6, 8828.95] ++ - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] ++ - [40, 11193.2] ++ - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] ++ - [40, 13014.1] ++ - - [64, 4096, 1, 64, 96, 96, 96, 96] ++ - [13, 2082.83] ++ - - [64, 4096, 1, 128, 96, 96, 160, 160] ++ - [62, 3882.72] ++ - - [64, 4096, 1, 256, 96, 96, 288, 288] ++ - [6, 6471.46] ++ - - [64, 4096, 1, 512, 96, 96, 544, 544] ++ - [76, 10176.5] ++ - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] ++ - [23, 14302.8] ++ - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] ++ - [35, 17831.5] ++ - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] ++ - [13, 19352.3] ++ - - [128, 64, 1, 64, 160, 160, 96, 96] ++ - [7, 68.5075] ++ - - [128, 64, 1, 
128, 160, 160, 160, 160] ++ - [55, 135.143] ++ - - [128, 64, 1, 256, 160, 160, 288, 288] ++ - [7, 237.503] ++ - - [128, 64, 1, 512, 160, 160, 544, 544] ++ - [30, 368.407] ++ - - [128, 64, 1, 1024, 160, 160, 1056, 1056] ++ - [78, 530.59] ++ - - [128, 64, 1, 2048, 160, 160, 2080, 2080] ++ - [14, 686.833] ++ - - [128, 64, 1, 4096, 160, 160, 4128, 4128] ++ - [13, 797.246] ++ - - [128, 128, 1, 64, 160, 160, 96, 96] ++ - [7, 199.578] ++ - - [128, 128, 1, 128, 160, 160, 160, 160] ++ - [69, 364.722] ++ - - [128, 128, 1, 256, 160, 160, 288, 288] ++ - [55, 620.643] ++ - - [128, 128, 1, 512, 160, 160, 544, 544] ++ - [5, 938.956] ++ - - [128, 128, 1, 1024, 160, 160, 1056, 1056] ++ - [25, 1223.72] ++ - - [128, 128, 1, 2048, 160, 160, 2080, 2080] ++ - [63, 1501.72] ++ - - [128, 128, 1, 4096, 160, 160, 4128, 4128] ++ - [49, 1674.46] ++ - - [128, 256, 1, 64, 160, 160, 96, 96] ++ - [41, 393.316] ++ - - [128, 256, 1, 128, 160, 160, 160, 160] ++ - [55, 772.289] ++ - - [128, 256, 1, 256, 160, 160, 288, 288] ++ - [30, 1279.92] ++ - - [128, 256, 1, 512, 160, 160, 544, 544] ++ - [30, 1908.45] ++ - - [128, 256, 1, 1024, 160, 160, 1056, 1056] ++ - [63, 2447.09] ++ - - [128, 256, 1, 2048, 160, 160, 2080, 2080] ++ - [37, 3029.07] ++ - - [128, 256, 1, 4096, 160, 160, 4128, 4128] ++ - [36, 3350.0] ++ - - [128, 512, 1, 64, 160, 160, 96, 96] ++ - [69, 843.246] ++ - - [128, 512, 1, 128, 160, 160, 160, 160] ++ - [30, 1536.95] ++ - - [128, 512, 1, 256, 160, 160, 288, 288] ++ - [4, 2548.96] ++ - - [128, 512, 1, 512, 160, 160, 544, 544] ++ - [4, 3685.68] ++ - - [128, 512, 1, 1024, 160, 160, 1056, 1056] ++ - [77, 4908.13] ++ - - [128, 512, 1, 2048, 160, 160, 2080, 2080] ++ - [25, 6048.58] ++ - - [128, 512, 1, 4096, 160, 160, 4128, 4128] ++ - [24, 6708.37] ++ - - [128, 1024, 1, 64, 160, 160, 96, 96] ++ - [30, 1525.77] ++ - - [128, 1024, 1, 128, 160, 160, 160, 160] ++ - [19, 2786.45] ++ - - [128, 1024, 1, 256, 160, 160, 288, 288] ++ - [76, 4559.65] ++ - - [128, 1024, 1, 512, 160, 160, 544, 544] ++ - 
[21, 7148.4] ++ - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] ++ - [24, 9529.14] ++ - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] ++ - [76, 11622.1] ++ - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] ++ - [14, 13149.3] ++ - - [128, 2048, 1, 64, 160, 160, 96, 96] ++ - [70, 2594.68] ++ - - [128, 2048, 1, 128, 160, 160, 160, 160] ++ - [66, 4902.79] ++ - - [128, 2048, 1, 256, 160, 160, 288, 288] ++ - [18, 8348.95] ++ - - [128, 2048, 1, 512, 160, 160, 544, 544] ++ - [5, 12993.0] ++ - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] ++ - [67, 17579.3] ++ - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] ++ - [40, 22628.9] ++ - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] ++ - [40, 26191.4] ++ - - [128, 4096, 1, 64, 160, 160, 96, 96] ++ - [62, 5000.66] ++ - - [128, 4096, 1, 128, 160, 160, 160, 160] ++ - [1, 8895.69] ++ - - [128, 4096, 1, 256, 160, 160, 288, 288] ++ - [24, 14269.4] ++ - - [128, 4096, 1, 512, 160, 160, 544, 544] ++ - [76, 22345.5] ++ - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] ++ - [35, 31809.0] ++ - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] ++ - [36, 37614.4] ++ - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] ++ - [24, 39328.4] ++ - - [256, 64, 1, 64, 288, 288, 96, 96] ++ - [7, 163.279] ++ - - [256, 64, 1, 128, 288, 288, 160, 160] ++ - [6, 288.072] ++ - - [256, 64, 1, 256, 288, 288, 288, 288] ++ - [54, 520.256] ++ - - [256, 64, 1, 512, 288, 288, 544, 544] ++ - [6, 817.286] ++ - - [256, 64, 1, 1024, 288, 288, 1056, 1056] ++ - [43, 1104.86] ++ - - [256, 64, 1, 2048, 288, 288, 2080, 2080] ++ - [25, 1420.35] ++ - - [256, 64, 1, 4096, 288, 288, 4128, 4128] ++ - [25, 1625.35] ++ - - [256, 128, 1, 64, 288, 288, 96, 96] ++ - [66, 414.293] ++ - - [256, 128, 1, 128, 288, 288, 160, 160] ++ - [4, 762.462] ++ - - [256, 128, 1, 256, 288, 288, 288, 288] ++ - [76, 1241.1] ++ - - [256, 128, 1, 512, 288, 288, 544, 544] ++ - [6, 1902.18] ++ - - [256, 128, 1, 1024, 288, 288, 1056, 1056] ++ - [13, 2502.76] ++ - - [256, 128, 1, 2048, 288, 288, 2080, 2080] ++ - [36, 2975.08] ++ - 
- [256, 128, 1, 4096, 288, 288, 4128, 4128] ++ - [13, 3369.51] ++ - - [256, 256, 1, 64, 288, 288, 96, 96] ++ - [29, 835.19] ++ - - [256, 256, 1, 128, 288, 288, 160, 160] ++ - [68, 1447.32] ++ - - [256, 256, 1, 256, 288, 288, 288, 288] ++ - [29, 2425.5] ++ - - [256, 256, 1, 512, 288, 288, 544, 544] ++ - [68, 3803.49] ++ - - [256, 256, 1, 1024, 288, 288, 1056, 1056] ++ - [29, 4981.37] ++ - - [256, 256, 1, 2048, 288, 288, 2080, 2080] ++ - [52, 6034.98] ++ - - [256, 256, 1, 4096, 288, 288, 4128, 4128] ++ - [48, 6712.74] ++ - - [256, 512, 1, 64, 288, 288, 96, 96] ++ - [54, 1519.68] ++ - - [256, 512, 1, 128, 288, 288, 160, 160] ++ - [29, 2781.83] ++ - - [256, 512, 1, 256, 288, 288, 288, 288] ++ - [40, 4657.1] ++ - - [256, 512, 1, 512, 288, 288, 544, 544] ++ - [29, 7127.13] ++ - - [256, 512, 1, 1024, 288, 288, 1056, 1056] ++ - [28, 9430.05] ++ - - [256, 512, 1, 2048, 288, 288, 2080, 2080] ++ - [29, 11697.6] ++ - - [256, 512, 1, 4096, 288, 288, 4128, 4128] ++ - [37, 13136.7] ++ - - [256, 1024, 1, 64, 288, 288, 96, 96] ++ - [54, 2661.78] ++ - - [256, 1024, 1, 128, 288, 288, 160, 160] ++ - [43, 4934.48] ++ - - [256, 1024, 1, 256, 288, 288, 288, 288] ++ - [1, 8247.37] ++ - - [256, 1024, 1, 512, 288, 288, 544, 544] ++ - [5, 12712.4] ++ - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] ++ - [27, 18102.1] ++ - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] ++ - [76, 22394.9] ++ - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] ++ - [68, 25998.0] ++ - - [256, 2048, 1, 64, 288, 288, 96, 96] ++ - [7, 4817.58] ++ - - [256, 2048, 1, 128, 288, 288, 160, 160] ++ - [15, 8664.8] ++ - - [256, 2048, 1, 256, 288, 288, 288, 288] ++ - [76, 14918.1] ++ - - [256, 2048, 1, 512, 288, 288, 544, 544] ++ - [13, 22453.8] ++ - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] ++ - [75, 31701.9] ++ - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] ++ - [13, 37712.2] ++ - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] ++ - [36, 39075.8] ++ - - [256, 4096, 1, 64, 288, 288, 96, 96] ++ - [40, 9556.98] ++ - - [256, 4096, 1, 
128, 288, 288, 160, 160] ++ - [15, 16241.3] ++ - - [256, 4096, 1, 256, 288, 288, 288, 288] ++ - [11, 23107.2] ++ - - [256, 4096, 1, 512, 288, 288, 544, 544] ++ - [31, 31195.3] ++ - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] ++ - [63, 36860.4] ++ - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] ++ - [76, 39337.8] ++ - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] ++ - [48, 42022.3] ++ - - [384, 64, 1, 64, 416, 416, 96, 96] ++ - [6, 251.256] ++ - - [384, 64, 1, 128, 416, 416, 160, 160] ++ - [6, 446.838] ++ - - [384, 64, 1, 256, 416, 416, 288, 288] ++ - [68, 796.894] ++ - - [384, 64, 1, 512, 416, 416, 544, 544] ++ - [20, 1252.28] ++ - - [384, 64, 1, 1024, 416, 416, 1056, 1056] ++ - [5, 1676.72] ++ - - [384, 64, 1, 2048, 416, 416, 2080, 2080] ++ - [24, 2141.59] ++ - - [384, 64, 1, 4096, 416, 416, 4128, 4128] ++ - [25, 2446.61] ++ - - [384, 128, 1, 64, 416, 416, 96, 96] ++ - [53, 630.28] ++ - - [384, 128, 1, 128, 416, 416, 160, 160] ++ - [21, 1154.18] ++ - - [384, 128, 1, 256, 416, 416, 288, 288] ++ - [5, 1919.88] ++ - - [384, 128, 1, 512, 416, 416, 544, 544] ++ - [69, 2759.72] ++ - - [384, 128, 1, 1024, 416, 416, 1056, 1056] ++ - [63, 3711.23] ++ - - [384, 128, 1, 2048, 416, 416, 2080, 2080] ++ - [14, 4546.27] ++ - - [384, 128, 1, 4096, 416, 416, 4128, 4128] ++ - [14, 5029.4] ++ - - [384, 256, 1, 64, 416, 416, 96, 96] ++ - [30, 1199.51] ++ - - [384, 256, 1, 128, 416, 416, 160, 160] ++ - [7, 2184.91] ++ - - [384, 256, 1, 256, 416, 416, 288, 288] ++ - [55, 3522.66] ++ - - [384, 256, 1, 512, 416, 416, 544, 544] ++ - [76, 5420.18] ++ - - [384, 256, 1, 1024, 416, 416, 1056, 1056] ++ - [25, 7347.16] ++ - - [384, 256, 1, 2048, 416, 416, 2080, 2080] ++ - [67, 8923.26] ++ - - [384, 256, 1, 4096, 416, 416, 4128, 4128] ++ - [63, 9954.85] ++ - - [384, 512, 1, 64, 416, 416, 96, 96] ++ - [68, 2011.66] ++ - - [384, 512, 1, 128, 416, 416, 160, 160] ++ - [55, 3736.57] ++ - - [384, 512, 1, 256, 416, 416, 288, 288] ++ - [69, 6603.47] ++ - - [384, 512, 1, 512, 416, 416, 544, 544] ++ - [28, 
10171.1] ++ - - [384, 512, 1, 1024, 416, 416, 1056, 1056] ++ - [19, 13639.1] ++ - - [384, 512, 1, 2048, 416, 416, 2080, 2080] ++ - [37, 17136.4] ++ - - [384, 512, 1, 4096, 416, 416, 4128, 4128] ++ - [37, 19473.0] ++ - - [384, 1024, 1, 64, 416, 416, 96, 96] ++ - [42, 3976.92] ++ - - [384, 1024, 1, 128, 416, 416, 160, 160] ++ - [61, 6604.34] ++ - - [384, 1024, 1, 256, 416, 416, 288, 288] ++ - [78, 11686.1] ++ - - [384, 1024, 1, 512, 416, 416, 544, 544] ++ - [63, 18004.6] ++ - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] ++ - [37, 23721.8] ++ - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] ++ - [37, 28165.5] ++ - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] ++ - [63, 29953.8] ++ - - [384, 2048, 1, 64, 416, 416, 96, 96] ++ - [67, 7821.55] ++ - - [384, 2048, 1, 128, 416, 416, 160, 160] ++ - [64, 13411.1] ++ - - [384, 2048, 1, 256, 416, 416, 288, 288] ++ - [72, 20886.7] ++ - - [384, 2048, 1, 512, 416, 416, 544, 544] ++ - [12, 28102.5] ++ - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] ++ - [60, 35289.5] ++ - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] ++ - [35, 37949.5] ++ - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] ++ - [47, 41000.3] ++ - - [384, 4096, 1, 64, 416, 416, 96, 96] ++ - [50, 11739.2] ++ - - [384, 4096, 1, 128, 416, 416, 160, 160] ++ - [64, 19569.1] ++ - - [384, 4096, 1, 256, 416, 416, 288, 288] ++ - [39, 27020.1] ++ - - [384, 4096, 1, 512, 416, 416, 544, 544] ++ - [54, 33590.9] ++ - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] ++ - [30, 36268.6] ++ - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] ++ - [7, 39724.1] ++ - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] ++ - [7, 41645.4] ++ - - [768, 64, 1, 64, 800, 800, 96, 96] ++ - [20, 511.584] ++ - - [768, 64, 1, 128, 800, 800, 160, 160] ++ - [6, 964.947] ++ - - [768, 64, 1, 256, 800, 800, 288, 288] ++ - [68, 1653.91] ++ - - [768, 64, 1, 512, 800, 800, 544, 544] ++ - [5, 2522.39] ++ - - [768, 64, 1, 1024, 800, 800, 1056, 1056] ++ - [76, 3431.16] ++ - - [768, 64, 1, 2048, 800, 800, 2080, 2080] ++ - [21, 4281.19] 
++ - - [768, 64, 1, 4096, 800, 800, 4128, 4128] ++ - [76, 4884.68] ++ - - [768, 128, 1, 64, 800, 800, 96, 96] ++ - [52, 1167.9] ++ - - [768, 128, 1, 128, 800, 800, 160, 160] ++ - [29, 2157.94] ++ - - [768, 128, 1, 256, 800, 800, 288, 288] ++ - [76, 3510.37] ++ - - [768, 128, 1, 512, 800, 800, 544, 544] ++ - [20, 5463.11] ++ - - [768, 128, 1, 1024, 800, 800, 1056, 1056] ++ - [40, 7291.27] ++ - - [768, 128, 1, 2048, 800, 800, 2080, 2080] ++ - [78, 8823.54] ++ - - [768, 128, 1, 4096, 800, 800, 4128, 4128] ++ - [62, 9937.89] ++ - - [768, 256, 1, 64, 800, 800, 96, 96] ++ - [54, 2087.41] ++ - - [768, 256, 1, 128, 800, 800, 160, 160] ++ - [27, 3835.68] ++ - - [768, 256, 1, 256, 800, 800, 288, 288] ++ - [66, 6526.41] ++ - - [768, 256, 1, 512, 800, 800, 544, 544] ++ - [52, 10080.4] ++ - - [768, 256, 1, 1024, 800, 800, 1056, 1056] ++ - [66, 13536.4] ++ - - [768, 256, 1, 2048, 800, 800, 2080, 2080] ++ - [28, 17018.3] ++ - - [768, 256, 1, 4096, 800, 800, 4128, 4128] ++ - [36, 19391.0] ++ - - [768, 512, 1, 64, 800, 800, 96, 96] ++ - [0, 3637.21] ++ - - [768, 512, 1, 128, 800, 800, 160, 160] ++ - [75, 7153.45] ++ - - [768, 512, 1, 256, 800, 800, 288, 288] ++ - [62, 11398.9] ++ - - [768, 512, 1, 512, 800, 800, 544, 544] ++ - [24, 17105.1] ++ - - [768, 512, 1, 1024, 800, 800, 1056, 1056] ++ - [36, 23418.3] ++ - - [768, 512, 1, 2048, 800, 800, 2080, 2080] ++ - [37, 27902.0] ++ - - [768, 512, 1, 4096, 800, 800, 4128, 4128] ++ - [62, 29851.1] ++ - - [768, 1024, 1, 64, 800, 800, 96, 96] ++ - [52, 6690.37] ++ - - [768, 1024, 1, 128, 800, 800, 160, 160] ++ - [64, 13391.4] ++ - - [768, 1024, 1, 256, 800, 800, 288, 288] ++ - [71, 20878.0] ++ - - [768, 1024, 1, 512, 800, 800, 544, 544] ++ - [10, 27938.8] ++ - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] ++ - [37, 35078.9] ++ - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] ++ - [36, 37988.9] ++ - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] ++ - [62, 40917.0] ++ - - [768, 2048, 1, 64, 800, 800, 96, 96] ++ - [73, 12139.8] ++ - - [768, 2048, 
1, 128, 800, 800, 160, 160] ++ - [51, 20062.4] ++ - - [768, 2048, 1, 256, 800, 800, 288, 288] ++ - [33, 27213.7] ++ - - [768, 2048, 1, 512, 800, 800, 544, 544] ++ - [54, 33451.3] ++ - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] ++ - [35, 36754.4] ++ - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] ++ - [60, 40713.7] ++ - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] ++ - [12, 43086.7] ++ - - [768, 4096, 1, 64, 800, 800, 96, 96] ++ - [73, 17071.7] ++ - - [768, 4096, 1, 128, 800, 800, 160, 160] ++ - [26, 25695.8] ++ - - [768, 4096, 1, 256, 800, 800, 288, 288] ++ - [58, 32861.6] ++ - - [768, 4096, 1, 512, 800, 800, 544, 544] ++ - [32, 36276.7] ++ - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] ++ - [9, 40014.3] ++ - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] ++ - [62, 42772.1] ++ - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] ++ - [75, 43035.5] ++ - - [1536, 64, 1, 64, 1568, 1568, 96, 96] ++ - [3, 893.549] ++ - - [1536, 64, 1, 128, 1568, 1568, 160, 160] ++ - [68, 1823.35] ++ - - [1536, 64, 1, 256, 1568, 1568, 288, 288] ++ - [66, 3156.78] ++ - - [1536, 64, 1, 512, 1568, 1568, 544, 544] ++ - [4, 4894.17] ++ - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] ++ - [66, 6763.21] ++ - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] ++ - [13, 8453.07] ++ - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] ++ - [13, 9663.84] ++ - - [1536, 128, 1, 64, 1568, 1568, 96, 96] ++ - [68, 2033.77] ++ - - [1536, 128, 1, 128, 1568, 1568, 160, 160] ++ - [4, 3649.35] ++ - - [1536, 128, 1, 256, 1568, 1568, 288, 288] ++ - [66, 6246.97] ++ - - [1536, 128, 1, 512, 1568, 1568, 544, 544] ++ - [52, 10036.2] ++ - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] ++ - [4, 13651.1] ++ - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] ++ - [48, 16991.8] ++ - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] ++ - [52, 19534.0] ++ - - [1536, 256, 1, 64, 1568, 1568, 96, 96] ++ - [76, 4064.25] ++ - - [1536, 256, 1, 128, 1568, 1568, 160, 160] ++ - [48, 6538.3] ++ - - [1536, 256, 1, 256, 1568, 1568, 288, 288] ++ - [48, 
11985.2] ++ - - [1536, 256, 1, 512, 1568, 1568, 544, 544] ++ - [24, 17247.2] ++ - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] ++ - [48, 23529.1] ++ - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] ++ - [62, 28109.4] ++ - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] ++ - [36, 29912.6] ++ - - [1536, 512, 1, 64, 1568, 1568, 96, 96] ++ - [27, 8165.42] ++ - - [1536, 512, 1, 128, 1568, 1568, 160, 160] ++ - [65, 13421.8] ++ - - [1536, 512, 1, 256, 1568, 1568, 288, 288] ++ - [71, 20221.7] ++ - - [1536, 512, 1, 512, 1568, 1568, 544, 544] ++ - [36, 28599.6] ++ - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] ++ - [62, 35306.6] ++ - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] ++ - [60, 38042.7] ++ - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] ++ - [36, 40905.5] ++ - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] ++ - [45, 11680.6] ++ - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] ++ - [64, 19427.4] ++ - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] ++ - [73, 27391.4] ++ - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] ++ - [74, 33371.0] ++ - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] ++ - [60, 36429.4] ++ - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] ++ - [35, 40697.3] ++ - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] ++ - [12, 43090.2] ++ - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] ++ - [45, 16626.2] ++ - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] ++ - [51, 25877.5] ++ - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] ++ - [74, 32844.2] ++ - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] ++ - [57, 36321.7] ++ - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] ++ - [44, 40125.9] ++ - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] ++ - [62, 42773.2] ++ - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] ++ - [75, 43092.5] ++ - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] ++ - [73, 21607.4] ++ - - [1536, 4096, 1, 128, 1568, 1568, 160, 160] ++ - [45, 30644.5] ++ - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] ++ - [34, 34998.2] ++ - - [1536, 4096, 1, 512, 1568, 1568, 544, 
544] ++ - [32, 39209.6] ++ - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] ++ - [36, 42262.3] ++ - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] ++ - [9, 42637.6] ++ - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] ++ - [29, 43208.6] ++ - - [3072, 64, 1, 64, 3104, 3104, 96, 96] ++ - [6, 1803.23] ++ - - [3072, 64, 1, 128, 3104, 3104, 160, 160] ++ - [62, 2871.5] ++ - - [3072, 64, 1, 256, 3104, 3104, 288, 288] ++ - [36, 5087.1] ++ - - [3072, 64, 1, 512, 3104, 3104, 544, 544] ++ - [13, 7863.11] ++ - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] ++ - [62, 10897.3] ++ - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] ++ - [62, 13628.0] ++ - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] ++ - [36, 14815.7] ++ - - [3072, 128, 1, 64, 3104, 3104, 96, 96] ++ - [38, 4018.17] ++ - - [3072, 128, 1, 128, 3104, 3104, 160, 160] ++ - [36, 6185.55] ++ - - [3072, 128, 1, 256, 3104, 3104, 288, 288] ++ - [14, 10363.8] ++ - - [3072, 128, 1, 512, 3104, 3104, 544, 544] ++ - [48, 17710.0] ++ - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] ++ - [62, 23113.1] ++ - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] ++ - [25, 27916.5] ++ - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] ++ - [13, 29818.4] ++ - - [3072, 256, 1, 64, 3104, 3104, 96, 96] ++ - [34, 7449.92] ++ - - [3072, 256, 1, 128, 3104, 3104, 160, 160] ++ - [64, 13829.3] ++ - - [3072, 256, 1, 256, 3104, 3104, 288, 288] ++ - [31, 20772.5] ++ - - [3072, 256, 1, 512, 3104, 3104, 544, 544] ++ - [37, 27921.3] ++ - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] ++ - [13, 34935.9] ++ - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] ++ - [75, 37956.7] ++ - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] ++ - [62, 40886.8] ++ - - [3072, 512, 1, 64, 3104, 3104, 96, 96] ++ - [73, 11637.4] ++ - - [3072, 512, 1, 128, 3104, 3104, 160, 160] ++ - [50, 19974.9] ++ - - [3072, 512, 1, 256, 3104, 3104, 288, 288] ++ - [34, 27012.8] ++ - - [3072, 512, 1, 512, 3104, 3104, 544, 544] ++ - [34, 33600.7] ++ - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] ++ 
- [35, 36905.2] ++ - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] ++ - [61, 40723.0] ++ - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] ++ - [47, 42817.9] ++ - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] ++ - [45, 16557.8] ++ - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] ++ - [45, 25375.2] ++ - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] ++ - [34, 33008.5] ++ - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] ++ - [32, 36453.3] ++ - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] ++ - [8, 40048.1] ++ - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] ++ - [36, 42814.4] ++ - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] ++ - [75, 43281.0] ++ - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] ++ - [73, 22138.4] ++ - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] ++ - [73, 30708.8] ++ - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] ++ - [46, 35121.0] ++ - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] ++ - [44, 39435.7] ++ - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] ++ - [36, 42320.8] ++ - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] ++ - [69, 42714.1] ++ - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] ++ - [27, 42866.9] ++ - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] ++ - [19, 6510.9] ++ - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] ++ - [16, 13148.5] ++ - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] ++ - [22, 24930.8] ++ - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] ++ - [11, 39133.0] ++ - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] ++ - [33, 41732.5] ++ - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] ++ - [34, 42575.9] ++ - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] ++ - [29, 43103.9] ++ - - [4096, 64, 1, 64, 4128, 4128, 96, 96] ++ - [24, 1910.63] ++ - - [4096, 64, 1, 128, 4128, 4128, 160, 160] ++ - [76, 3657.97] ++ - - [4096, 64, 1, 256, 4128, 4128, 288, 288] ++ - [13, 6304.29] ++ - - [4096, 64, 1, 512, 4128, 4128, 544, 544] ++ - [24, 9697.13] ++ - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] ++ - [24, 13918.0] ++ - - [4096, 64, 1, 2048, 4128, 4128, 
2080, 2080] ++ - [36, 17590.2] ++ - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] ++ - [62, 19365.6] ++ - - [4096, 128, 1, 64, 4128, 4128, 96, 96] ++ - [4, 5850.82] ++ - - [4096, 128, 1, 128, 4128, 4128, 160, 160] ++ - [2, 10205.2] ++ - - [4096, 128, 1, 256, 4128, 4128, 288, 288] ++ - [36, 16625.5] ++ - - [4096, 128, 1, 512, 4128, 4128, 544, 544] ++ - [13, 24310.4] ++ - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] ++ - [62, 31786.4] ++ - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] ++ - [13, 37607.9] ++ - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] ++ - [36, 39160.5] ++ - - [4096, 256, 1, 64, 4128, 4128, 96, 96] ++ - [54, 9831.36] ++ - - [4096, 256, 1, 128, 4128, 4128, 160, 160] ++ - [17, 16168.9] ++ - - [4096, 256, 1, 256, 4128, 4128, 288, 288] ++ - [33, 23303.7] ++ - - [4096, 256, 1, 512, 4128, 4128, 544, 544] ++ - [56, 31410.7] ++ - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] ++ - [60, 36991.2] ++ - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] ++ - [37, 39511.5] ++ - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] ++ - [76, 41894.4] ++ - - [4096, 512, 1, 64, 4128, 4128, 96, 96] ++ - [73, 13413.7] ++ - - [4096, 512, 1, 128, 4128, 4128, 160, 160] ++ - [64, 22455.7] ++ - - [4096, 512, 1, 256, 4128, 4128, 288, 288] ++ - [10, 29743.5] ++ - - [4096, 512, 1, 512, 4128, 4128, 544, 544] ++ - [32, 36082.6] ++ - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] ++ - [71, 38464.0] ++ - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] ++ - [62, 41781.9] ++ - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] ++ - [76, 43408.5] ++ - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] ++ - [73, 18740.3] ++ - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] ++ - [73, 27851.8] ++ - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] ++ - [74, 34575.6] ++ - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] ++ - [44, 37737.4] ++ - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] ++ - [36, 41071.9] ++ - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] ++ - [36, 43425.9] ++ - - [4096, 1024, 1, 4096, 4128, 
4128, 4128, 4128] ++ - [63, 43288.9] ++ - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] ++ - [58, 22742.0] ++ - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] ++ - [33, 31786.3] ++ - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] ++ - [58, 36015.3] ++ - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] ++ - [59, 39381.7] ++ - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] ++ - [33, 41619.2] ++ - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] ++ - [27, 41878.8] ++ - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] ++ - [32, 43314.6] ++ - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] ++ - [19, 6780.39] ++ - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] ++ - [16, 13826.8] ++ - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] ++ - [11, 26586.0] ++ - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] ++ - [11, 39749.7] ++ - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] ++ - [33, 41006.2] ++ - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] ++ - [31, 42968.4] ++ - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] ++ - [63, 43313.6] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_SB.yaml +new file mode 100644 +index 00000000..ba2f617e +--- /dev/null ++++ b/library/src/blas3/Tensile/Logic/asm_full/phoenix/phoenix_Cijk_Alik_Bljk_SB.yaml +@@ -0,0 +1,310 @@ ++- {MinimumRequiredVersion: 4.33.0} ++- phoenix ++- gfx1103 ++- [Device 1586] ++- AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 0 ++ DestDataType: 0 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: 
[0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++- - 1LDSBuffer: 0 ++ AggressivePerfMode: 1 ++ AssertAlphaValue: false ++ AssertBetaValue: false ++ AssertCEqualsD: false ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertMinApproxSize: 0 ++ AssertSizeEqual: {} ++ AssertSizeGreaterThan: {} ++ AssertSizeLessThan: {} ++ AssertSizeMultiple: {} ++ AssertStrideAEqual: {0: 1} ++ AssertStrideBEqual: {0: 1} ++ AssertStrideCEqual: {0: 1} ++ AssertStrideDEqual: {0: 1} ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ AtomicAddC: false ++ BufferLoad: true ++ BufferStore: true ++ CheckDimOverflow: 0 ++ CheckTensorDimAsserts: false ++ CodeObjectVersion: default ++ CustomKernelName: '' ++ DepthU: 8 ++ DepthULdsDivisor: 1 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DisableAtomicFail: 0 ++ DisableKernelPieces: 0 ++ DisableVgprOverlapping: false ++ EdgeType: ShiftPtr ++ EnableMatrixInstruction: false ++ ExpandPointerSwap: 0 ++ Fp16AltImpl: false ++ FractionalLoad: 0 ++ GlobalLoadVectorWidthA: 1 ++ GlobalLoadVectorWidthB: 1 ++ GlobalRead2A: true ++ GlobalRead2B: true ++ GlobalReadCoalesceGroupA: true ++ GlobalReadCoalesceGroupB: true ++ GlobalReadCoalesceVectorA: true ++ GlobalReadCoalesceVectorB: true ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidth: 1 ++ GlobalSplitU: 1 ++ 
GlobalSplitUAlgorithm: SingleBuffer ++ GlobalSplitUSummationAssignmentRoundRobin: true ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ ISA: [11, 0, 3] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ KernelLanguage: Assembly ++ LSCA: 8 ++ LSCB: 8 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 32 ++ LVPB: 32 ++ LdcEqualsLdd: false ++ LdsBlockSizePerPad: 0 ++ LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadB: 0 ++ LdsInitCVgprs: false ++ LdsNumElements: 512 ++ LdsOffsetA: 0 ++ LdsOffsetB: 256 ++ LdsPadA: 0 ++ LdsPadB: 0 ++ LocalDotLayout: 1 ++ LocalRead2A: true ++ LocalRead2B: true ++ LocalReadVectorWidth: 1 ++ LocalSplitU: 1 ++ LocalWrite2A: true ++ LocalWrite2B: true ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopDoWhile: false ++ LoopIters: 8 ++ LoopTail: true ++ LoopUnroll: 8 ++ MACInstruction: FMA ++ MIArchVgpr: false ++ MacroTile0: 32 ++ MacroTile1: 32 ++ MacroTileA: 32 ++ MacroTileB: 32 ++ MacroTileShapeMax: 64 ++ MacroTileShapeMin: 1 ++ MagicDivAlg: 2 ++ MatrixInstruction: [] ++ MaxOccupancy: 40 ++ MaxVgprNumber: 256 ++ MinVgprNumber: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 4 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ OptNoLoadLoop: 1 ++ OptPreLoopVmcnt: 0 ++ PackBatchDims: 0 ++ PackFreeDims: 1 ++ PackGranularity: 2 ++ PackSummationDims: 0 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PerformanceSyncLocation: -1 ++ PerformanceWaitCount: -1 ++ PerformanceWaitLocation: -1 ++ PersistentKernel: 0 ++ PersistentKernelAlongBatch: false ++ 
PrefetchAcrossPersistent: 0 ++ PrefetchAcrossPersistentMode: 0 ++ PrefetchGlobalRead: false ++ PrefetchLocalRead: true ++ ProblemType: ++ AllowNoFreeDims: false ++ AssignedDerivedParameters: true ++ Batched: true ++ ComplexConjugateA: false ++ ComplexConjugateB: false ++ ComputeDataType: 0 ++ ConvolutionConfig: [] ++ DataType: 0 ++ DestDataType: 0 ++ Fp16AltImpl: false ++ HighPrecisionAccumulate: false ++ Index0: 0 ++ Index01A: 0 ++ Index01B: 1 ++ Index1: 1 ++ IndexAssignmentsA: [3, 0, 2] ++ IndexAssignmentsB: [3, 1, 2] ++ IndexAssignmentsLD: [4, 5, 6, 7] ++ IndexUnroll: 3 ++ IndexUnrollA: 0 ++ IndexUnrollB: 0 ++ IndicesBatch: [2] ++ IndicesFree: [0, 1] ++ IndicesSummation: [3] ++ MirrorDimsA: [] ++ MirrorDimsB: [] ++ NumIndicesBatch: 1 ++ NumIndicesC: 3 ++ NumIndicesFree: 2 ++ NumIndicesLD: 4 ++ NumIndicesSummation: 1 ++ OperationType: GEMM ++ SetConstStrideA: [] ++ SetConstStrideB: [] ++ SilentHighPrecisionAccumulate: false ++ StridedBatched: true ++ TLUA: false ++ TLUB: false ++ Tensor0: 0 ++ Tensor1: 1 ++ TileA: 0 ++ TileAwareSelection: false ++ TileB: 1 ++ TotalIndices: 4 ++ TransposeA: true ++ TransposeB: false ++ UseBeta: true ++ UseInitialStridesAB: false ++ UseInitialStridesCD: false ++ ZeroPadA: [] ++ ZeroPadB: [] ++ ReplacementKernel: false ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 1 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 0 ++ SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_ ++ SourceSwap: false ++ StaggerU: 32 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StoreCInUnroll: false ++ StoreCInUnrollExact: false ++ StoreCInUnrollInterval: 1 ++ StoreCInUnrollPostLoop: false ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [2, 2] ++ ThreadTile0: 2 ++ ThreadTile1: 2 ++ ThreadTileA: 2 ++ ThreadTileB: 2 ++ TransposeLDS: 0 ++ UnrollIncIsDepthU: 0 ++ UnrollMajorLDSA: 0 ++ UnrollMajorLDSB: 0 ++ 
UnrollMemFence: false ++ Use64bShadowLimit: 1 ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: -1 ++ Valid: true ++ VectorAtomicWidth: 1 ++ VectorStore: -1 ++ VectorWidth: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 8 ++ WorkGroupMappingType: B ++ _DepthULds: 8 ++ _GlobalAccumulation: null ++ _UseSgprForGRO: 1 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemC: 0 ++ _staggerStrideShift: 3 ++ allowLRVWforTLUandMI: false ++- [2, 3, 0, 1] ++- - - [126, 126, 2, 66, 126, 126, 66, 66] ++ - [0, 0] ++- null ++- null ++- DeviceEfficiency +diff --git a/library/src/handle.cpp b/library/src/handle.cpp +index 4c4bd320..728f3c80 100644 +--- a/library/src/handle.cpp ++++ b/library/src/handle.cpp +@@ -141,6 +141,10 @@ static Processor getActiveArch(int deviceId) + { + return Processor::gfx1102; + } ++ else if(deviceString.find("gfx1103") != std::string::npos) ++ { ++ return Processor::gfx1103; ++ } + return static_cast(0); + } + +diff --git a/library/src/include/handle.hpp b/library/src/include/handle.hpp +index 282edb8f..a7611d4a 100644 +--- a/library/src/include/handle.hpp ++++ b/library/src/include/handle.hpp +@@ -92,7 +92,8 @@ enum class Processor : int + gfx1035 = 1035, + gfx1100 = 1100, + gfx1101 = 1101, +- gfx1102 = 1102 ++ gfx1102 = 1102, ++ gfx1103 = 1103 + }; + + // helper function in handle.cpp +diff --git a/library/src/tensile_host.cpp b/library/src/tensile_host.cpp +index 53bebba1..cd8089fd 100644 +--- a/library/src/tensile_host.cpp ++++ b/library/src/tensile_host.cpp +@@ -306,6 +306,10 @@ namespace + { + return Tensile::LazyLoadingInit::gfx1102; + } ++ else if(deviceString.find("gfx1103") != std::string::npos) ++ { ++ return Tensile::LazyLoadingInit::gfx1103; ++ } + return Tensile::LazyLoadingInit::None; + } + +-- +2.37.3 + diff --git a/0001-fixup-install-of-tensile-output.patch b/0001-fixup-install-of-tensile-output.patch new file mode 100644 index 0000000..f39e5d6 --- /dev/null +++ 
b/0001-fixup-install-of-tensile-output.patch @@ -0,0 +1,25 @@ +From 3d82251d51a9804c28cb84d598084fc12ca0418f Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 13 Jan 2024 14:36:01 -0500 +Subject: [PATCH] fixup install of tensile output + +--- + library/src/CMakeLists.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt +index 6acedfb2..2877c6e1 100755 +--- a/library/src/CMakeLists.txt ++++ b/library/src/CMakeLists.txt +@@ -591,7 +591,7 @@ if( BUILD_WITH_TENSILE ) + if (WIN32) + set( ROCBLAS_TENSILE_LIBRARY_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/bin/rocblas" CACHE PATH "path to tensile library" ) + else() +- set( ROCBLAS_TENSILE_LIBRARY_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}${CMAKE_INSTALL_LIBDIR}/rocblas" CACHE PATH "path to tensile library" ) ++ set( ROCBLAS_TENSILE_LIBRARY_DIR "${CMAKE_INSTALL_LIBDIR}/rocblas" CACHE PATH "path to tensile library" ) + endif() + # For ASAN package, Tensile library files(which are not shared libraries) are not required + if( NOT ENABLE_ASAN_PACKAGING ) +-- +2.43.0 + diff --git a/0001-offload-compress-option.patch b/0001-offload-compress-option.patch new file mode 100644 index 0000000..4a403d8 --- /dev/null +++ b/0001-offload-compress-option.patch @@ -0,0 +1,44 @@ +From 2966285dc09ca9c7e6b95c5212a2d5bd46ab8376 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 27 Sep 2024 05:40:14 -0700 +Subject: [PATCH] offload compress option + +Try out --offload-compress + +Signed-off-by: Tom Rix +--- + cmake/build-options.cmake | 2 ++ + library/CMakeLists.txt | 4 ++++ + 2 files changed, 6 insertions(+) + +diff --git a/cmake/build-options.cmake b/cmake/build-options.cmake +index cb35e72fb157..26d04c0aefdb 100755 +--- a/cmake/build-options.cmake ++++ b/cmake/build-options.cmake +@@ -36,6 +36,8 @@ option( BUILD_SHARED_LIBS "Build rocBLAS as a shared library" ON ) + # library without tensile to allow for rapid iteration without GEMM functionality + option( 
BUILD_WITH_TENSILE "Build full functionality which requires tensile?" ON ) + ++option( BUILD_OFFLOAD_COMPRESS "Build with offload compress?" OFF ) ++ + include(clients/cmake/client-build-options.cmake) + + if (WIN32) +diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt +index 90a75dd394d2..0386a3058d45 100755 +--- a/library/CMakeLists.txt ++++ b/library/CMakeLists.txt +@@ -66,6 +66,10 @@ function( rocblas_library_settings lib_target_ ) + # Do not allow Variable Length Arrays (use unique_ptr instead) + target_compile_options( ${lib_target_} PRIVATE -Werror=vla ) + ++ if ( BUILD_OFFLOAD_COMPRESS ) ++ target_compile_options( ${lib_target_} PRIVATE --offload-compress ) ++ endif () ++ + target_compile_definitions( ${lib_target_} PRIVATE ROCM_USE_FLOAT16 ROCBLAS_INTERNAL_API ROCBLAS_BETA_FEATURES_API ) + + # both libraries will use rocblas_EXPORTS +-- +2.46.0 + diff --git a/0001-option-to-disable-roctracer-logging.patch b/0001-option-to-disable-roctracer-logging.patch new file mode 100644 index 0000000..9f344da --- /dev/null +++ b/0001-option-to-disable-roctracer-logging.patch @@ -0,0 +1,82 @@ +diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt +index 762580d7..f02d62e6 100644 +--- a/library/CMakeLists.txt ++++ b/library/CMakeLists.txt +@@ -80,7 +80,7 @@ function( rocblas_library_settings lib_target_ ) + target_link_libraries( ${lib_target_} PRIVATE hip::device ) + else() + target_link_libraries( ${lib_target_} PRIVATE hip::device -lstdc++fs --rtlib=compiler-rt --unwindlib=libgcc) +- if (BUILD_SHARED_LIBS) ++ if (BUILD_SHARED_LIBS AND NOT DISABLE_ROCTRACER) + target_link_libraries(${lib_target_} PRIVATE -lroctx64) + endif() + endif() +diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt +index 5deab888..f96c7e3e 100644 +--- a/library/src/CMakeLists.txt ++++ b/library/src/CMakeLists.txt +@@ -28,6 +28,10 @@ + # package_targets is used as a list of install target + set( package_targets rocblas ) + ++# we want to decrease the number of 
build deps for EPEL packages so patching in an option ++# to disable its use ++ ++option( DISABLE_ROCTRACER "Disable use of rocTRACER for logging" ON ) + + # Set up Tensile Dependency + if( BUILD_WITH_TENSILE ) +@@ -688,7 +692,12 @@ if(BUILD_WITH_HIPBLASLT) + find_package( hipblaslt ${HIPBLASLT_VERSION} REQUIRED CONFIG PATHS ${hipblaslt_path} ${ROCM_PATH}) + endif() + +-if( NOT BUILD_SHARED_LIBS ) ++if ( DISABLE_ROCTRACER ) ++ message( "roctracer is disabled for this build" ) ++ target_compile_definitions( rocblas PRIVATE DISABLE_ROCTRACER ) ++endif() ++ ++if( NOT DISABLE_ROCTRACER AND NOT BUILD_SHARED_LIBS ) + target_compile_definitions( rocblas PRIVATE ROCBLAS_STATIC_LIB ) + endif() + +diff --git a/library/src/include/logging.hpp b/library/src/include/logging.hpp +index 352e4e39..d222879a 100644 +--- a/library/src/include/logging.hpp ++++ b/library/src/include/logging.hpp +@@ -40,7 +40,7 @@ + #include + #include + +-#if !defined(ROCBLAS_STATIC_LIB) && !defined(WIN32) ++#if !defined(DISABLE_ROCTRACER) && !defined(ROCBLAS_STATIC_LIB) && !defined(WIN32) + #include + #endif + +@@ -453,7 +453,7 @@ class Logger + public: + Logger() = default; + +-#if !defined(ROCBLAS_STATIC_LIB) && !defined(WIN32) ++#if !defined(DISABLE_ROCTRACER) && !defined(ROCBLAS_STATIC_LIB) && !defined(WIN32) + void log_range(const std::string& name) + { + if(!m_active) +@@ -472,7 +472,7 @@ public: + // ((os << sep << std::forward(xs)), ...); + (void)(int[]){(os << sep << std::forward(xs), 0)...}; + +-#if !defined(ROCBLAS_STATIC_LIB) && !defined(WIN32) ++#if !defined(DISABLE_ROCTRACER) && !defined(ROCBLAS_STATIC_LIB) && !defined(WIN32) + log_range(os.str()); + #endif + os << std::endl; +@@ -527,7 +527,7 @@ public: + + ~Logger() + { +-#if !defined(ROCBLAS_STATIC_LIB) && !defined(WIN32) ++#if !defined(DISABLE_ROCTRACER) && !defined(ROCBLAS_STATIC_LIB) && !defined(WIN32) + if(m_active) + { + roctxRangePop(); diff --git a/0001-prepare-rocblas-cmake-for-fedora.patch 
b/0001-prepare-rocblas-cmake-for-fedora.patch new file mode 100644 index 0000000..3af4b54 --- /dev/null +++ b/0001-prepare-rocblas-cmake-for-fedora.patch @@ -0,0 +1,26 @@ +From aba3a118fd32f415e63b4e24555b8df98e89292d Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Tue, 3 Oct 2023 10:37:12 -0700 +Subject: [PATCH] prepare rocblas cmake for fedora + +Signed-off-by: Tom Rix +--- + clients/CMakeLists.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt +index da44cef0..6d39c1f3 100755 +--- a/clients/CMakeLists.txt ++++ b/clients/CMakeLists.txt +@@ -131,7 +131,7 @@ if( BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_TESTS) + message(STATUS "Linking BLIS LIB: ${BLAS_LIBRARY}") + endif() + else() +- set( BLAS_LIBRARY "blas" ) ++ set( BLAS_LIBRARY "cblas" ) + endif() + else() # WIN32 + set( BLAS_INCLUDE_DIR ${OPENBLAS_DIR}/include CACHE PATH "OpenBLAS library include path" ) +-- +2.41.0 + diff --git a/Modify-CMakeLists.txt-files-to-allow-to-build-modules-independently.patch b/Modify-CMakeLists.txt-files-to-allow-to-build-modules-independently.patch new file mode 100644 index 0000000..638b1f3 --- /dev/null +++ b/Modify-CMakeLists.txt-files-to-allow-to-build-modules-independently.patch @@ -0,0 +1,80 @@ +From: Egbert Eich +Date: Wed Apr 30 16:18:49 2025 +0200 +Subject: Modify CMakeLists.txt files to allow to build modules independently +Patch-mainline: Not yet +Git-commit: f4724507a2770b2ed5ecc633aa406ad70a675e6f +References: + +Signed-off-by: Egbert Eich +Signed-off-by: Egbert Eich +--- + library/src/CMakeLists.txt | 24 +++++------------------- + library/src/TensileInstall/CMakeLists.txt | 19 +++++++++++++++++++ + 2 files changed, 24 insertions(+), 19 deletions(-) +diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt +index 35342e1..efa732c 100644 +--- a/library/src/CMakeLists.txt ++++ b/library/src/CMakeLists.txt +@@ -97,7 +97,9 @@ if( BUILD_WITH_TENSILE ) + set_target_properties( TensileHost 
PROPERTIES OUTPUT_NAME rocblas-tensile CXX_EXTENSIONS NO ) + + # Tensile host depends on libs build target +- add_dependencies( TensileHost TENSILE_LIBRARY_TARGET ) ++ if(NOT DEFINED ENV{TENSILE_SKIP_LIBRARY} OR NOT $ENV{TENSILE_SKIP_LIBRARY}) ++ add_dependencies( TensileHost TENSILE_LIBRARY_TARGET ) ++ endif() + + if( ROCBLAS_SHARED_LIBS ) + set( BUILD_SHARED_LIBS ON ) +@@ -823,24 +825,8 @@ rocm_install_targets( + ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_INCLUDEDIR} + ) + +-if( BUILD_WITH_TENSILE ) +- if (WIN32) +- set( ROCBLAS_TENSILE_LIBRARY_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/bin/rocblas" CACHE PATH "path to tensile library" ) +- else() +- set( ROCBLAS_TENSILE_LIBRARY_DIR "${CMAKE_INSTALL_LIBDIR}/rocblas" CACHE PATH "path to tensile library" ) +- endif() +- # For ASAN package, Tensile library files(which are not shared libraries) are not required +- if( NOT ENABLE_ASAN_PACKAGING ) +- if( BUILD_SHARED_LIBS ) +- set( TENSILE_DATA_COMPONENT_NAME ${CMAKE_INSTALL_DEFAULT_COMPONENT_NAME} ) +- else() +- set( TENSILE_DATA_COMPONENT_NAME devel ) +- endif() +- rocm_install( +- DIRECTORY ${CMAKE_BINARY_DIR}/Tensile/library +- DESTINATION ${ROCBLAS_TENSILE_LIBRARY_DIR} +- COMPONENT ${TENSILE_DATA_COMPONENT_NAME}) # Use this cmake variable to be compatible with rocm-cmake 0.6 and 0.7 +- endif() ++if(NOT DEFINED ENV{TENSILE_SKIP_LIBRARY} OR NOT $ENV{TENSILE_SKIP_LIBRARY}) ++ add_subdirectory( TensileInstall ) + endif() + + if(NOT WIN32) +diff --git a/library/src/TensileInstall/CMakeLists.txt b/library/src/TensileInstall/CMakeLists.txt +new file mode 100644 +index 0000000..fa39e9f +--- /dev/null ++++ b/library/src/TensileInstall/CMakeLists.txt +@@ -0,0 +1,19 @@ ++if( BUILD_WITH_TENSILE ) ++ if (WIN32) ++ set( ROCBLAS_TENSILE_LIBRARY_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/bin/rocblas" CACHE PATH "path to tensile library" ) ++ else() ++ set( ROCBLAS_TENSILE_LIBRARY_DIR "${CMAKE_INSTALL_LIBDIR}/rocblas" CACHE PATH "path to tensile library" ) ++ endif() ++ # For ASAN package, 
Tensile library files(which are not shared libraries) are not required ++ if( NOT ENABLE_ASAN_PACKAGING ) ++ if( BUILD_SHARED_LIBS ) ++ set( TENSILE_DATA_COMPONENT_NAME ${CMAKE_INSTALL_DEFAULT_COMPONENT_NAME} ) ++ else() ++ set( TENSILE_DATA_COMPONENT_NAME devel ) ++ endif() ++ rocm_install( ++ DIRECTORY ${CMAKE_BINARY_DIR}/Tensile/library ++ DESTINATION ${ROCBLAS_TENSILE_LIBRARY_DIR} ++ COMPONENT ${TENSILE_DATA_COMPONENT_NAME}) # Use this cmake variable to be compatible with rocm-cmake 0.6 and 0.7 ++ endif() ++endif() diff --git a/_constraints b/_constraints new file mode 100644 index 0000000..eb589de --- /dev/null +++ b/_constraints @@ -0,0 +1,14 @@ + + + + + 60 + + + 16 + + 4 + 4 + + SLOW_CPU + diff --git a/rocBLAS-6.4.0.tar.gz b/rocBLAS-6.4.0.tar.gz new file mode 100644 index 0000000..e9f6262 --- /dev/null +++ b/rocBLAS-6.4.0.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab8e75c9f98d17817a650aa4f06ff1e6c6af92cd143079e361cb6a0c96676aaa +size 57974253 diff --git a/rocblas.rpmlintrc b/rocblas.rpmlintrc new file mode 100644 index 0000000..1e2fff2 --- /dev/null +++ b/rocblas.rpmlintrc @@ -0,0 +1,3 @@ +# librocblas4-arch-.* are no shared libraries but modules loaded onto the GPU. 
+addFilter("rocblas-arch-.*: W: shared-lib-without-dependency-information") +addFilter("rocblas-arch-.*: E: shlib-fixed-dependency.") diff --git a/rocblas.spec b/rocblas.spec new file mode 100644 index 0000000..7222f2e --- /dev/null +++ b/rocblas.spec @@ -0,0 +1,551 @@ +%if 0%{?suse_version} +%global rocblas_name librocblas4 +%else +%global rocblas_name rocblas +%endif + +%global upstreamname rocBLAS +%global rocm_release 6.4 +%global rocm_patch 0 +%global rocm_version %{rocm_release}.%{rocm_patch} + +%if 0%{?suse_version} +# On SUSE build tensile modules - the .so module is built in a separate package +%bcond_without tensile_package + +%define build_tensile_separately 1 +%else +%define build_tensile_separately 0 +%endif + +%global toolchain rocm +# hipcc does not support some clang flags +%global build_cxxflags %(echo %{optflags} | sed -e 's/-fstack-protector-strong/-Xarch_host -fstack-protector-strong/' -e 's/-fcf-protection/-Xarch_host -fcf-protection/') + +%bcond_with debug +%if %{with debug} +%global build_type DEBUG +%else +%global build_type RELEASE +%endif + +%bcond_without compress +%if %{with compress} +%global build_compress ON +%else +%global build_compress OFF +%endif + +%bcond_with test +%if %{with test} && %{without tensile_package} +%global build_test ON +%global __brp_check_rpaths %{nil} +%else +%global build_test OFF +%endif + +# Option to run the test suite on real HW: +# May have to set gpu under test with +# export HIP_VISIBLE_DEVICES= - 0, 1 etc. 
+%bcond_with check + +# Tensile in 6.4 does not support generics +# https://github.com/ROCm/Tensile/issues/2124 +%bcond_without tensile +%if %{with tensile} +%global build_tensile ON +%else +%global build_tensile OFF +%endif + +%if 0%{?rhel} || 0%{?sle_version} > 160000 +%bcond_with msgpack +%else +%bcond_without msgpack +%endif + +# Use ninja if it is available +# Ninja is available on suse but obs times out with ninja build, make doesn't +%if 0%{?fedora} +%bcond_without ninja +%else +%bcond_with ninja +%endif + +%if 0%{?rhel} && 0%{?rhel} < 10 +# On CS9: /usr/bin/debugedit: Cannot handle 8-byte build ID +%global debug_package %{nil} +%endif + +# Compression type and level for source/binary package payloads. +# "w7T0.xzdio" xz level 7 using %%{getncpus} threads +%global _source_payload w7T0.xzdio +%global _binary_payload w7T0.xzdio + +# SUSE/OBS times out because -O is added to the make args +# This accumulates all the output from the long running tensile +# jobs. +%global _make_output_sync %{nil} + +# OracleLinux 9 has a problem with its strip not recognizing *.co's +%global __strip %rocmllvm_bindir/llvm-strip + +%if %{with ninja} +%global cmake_generator -G Ninja +%else +%global cmake_generator %{nil} +%endif + +%global cmake_config \\\ + -DCMAKE_CXX_COMPILER=hipcc \\\ + -DCMAKE_C_COMPILER=hipcc \\\ + -DCMAKE_LINKER=%rocmllvm_bindir/ld.lld \\\ + -DCMAKE_AR=%rocmllvm_bindir/llvm-ar \\\ + -DCMAKE_RANLIB=%rocmllvm_bindir/llvm-ranlib \\\ + -DCMAKE_BUILD_TYPE=%{build_type} \\\ + -DCMAKE_PREFIX_PATH=%{rocmllvm_cmakedir}/.. 
\\\ + -DCMAKE_SKIP_RPATH=ON \\\ + -DCMAKE_VERBOSE_MAKEFILE=ON \\\ + -DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF \\\ + -DROCM_SYMLINK_LIBS=OFF \\\ + -DHIP_PLATFORM=amd \\\ + -DBUILD_CLIENTS_BENCHMARKS=%{build_test} \\\ + -DBUILD_CLIENTS_TESTS=%{build_test} \\\ + -DBUILD_CLIENTS_TESTS_OPENMP=OFF \\\ + -DBUILD_FORTRAN_CLIENTS=OFF \\\ + -DBLAS_LIBRARY=cblas \\\ + -DBUILD_OFFLOAD_COMPRESS=%{build_compress} \\\ + -DBUILD_WITH_HIPBLASLT=OFF \\\ + -DTensile_COMPILER=hipcc \\\ + -DTensile_CPU_THREADS=${CORES} \\\ + -DTensile_LIBRARY_FORMAT=%{tensile_library_format} \\\ + -DTensile_VERBOSE=%{tensile_verbose} \\\ + -DTensile_DIR=${TP}/cmake \\\ + -DDISABLE_ROCTRACER=ON \\\ + -DBUILD_WITH_PIP=OFF + +%bcond_with generic +%global rocm_gpu_list_generic "gfx9-generic;gfx9-4-generic;gfx10-1-generic;gfx10-3-generic;gfx11-generic;gfx12-generic" +%if %{with generic} +%global gpu_list %{rocm_gpu_list_generic} +%else +%global gpu_list %{rocm_gpu_list_default} +%endif + +# gfx950 is an experimental target +# Enabling will short circuit the normal build. +# There is no check support. 
+# To use do +# $ module load rocm/gfx950 +# +# $ module purge +%bcond_with gfx950 + +%if %{with gfx950} && %{with tensile_package} +ExclusiveArch: do_not_build +%endif + +Name: %{rocblas_name} +Version: %{rocm_version} +Release: 9%{?dist} +Summary: BLAS implementation for ROCm +Url: https://github.com/ROCmSoftwarePlatform/%{upstreamname} +License: MIT AND BSD-3-Clause + +Source0: %{url}/archive/refs/tags/rocm-%{rocm_version}.tar.gz#/%{upstreamname}-%{rocm_version}.tar.gz +Source1: rocblas.rpmlintrc +Patch2: 0001-fixup-install-of-tensile-output.patch +Patch3: Modify-CMakeLists.txt-files-to-allow-to-build-modules-independently.patch +Patch4: 0001-offload-compress-option.patch +Patch6: 0001-option-to-disable-roctracer-logging.patch + +%if 0%{build_tensile_separately} +Requires: rocblas-tensile = %version +%if %{with check} +# If %check is enabled, we need to serialize the builds which will introduce +# a circular dependency. The flag below causes OBS to ignore this. +# +#!BuildIgnore: %name +BuildRequires: rocblas-tensile = %version +%endif +%endif +BuildRequires: cmake +BuildRequires: gcc-c++ +BuildRequires: rocm-cmake +BuildRequires: rocm-comgr-devel +BuildRequires: rocm-compilersupport-macros +BuildRequires: rocm-hip-devel +BuildRequires: rocm-runtime-devel +BuildRequires: rocm-rpm-macros +BuildRequires: rocm-rpm-macros-modules + +%if %{with tensile} +%if 0%{?suse_version} +BuildRequires: %{python_module tensile-devel} +%if %{suse_version} < 1699 +BuildRequires: %{python_module joblib} +%endif # suse_version < 1699 +# OBS vm times out without console output +%global tensile_verbose 2 +%{?with_msgpack:BuildRequires: msgpack-cxx-devel} +%else # ?suse_version +BuildRequires: python3dist(tensile) +%if 0%{?rhel} +%global tensile_verbose 2 +%else +%{?with_msgpack:BuildRequires: msgpack-devel} +%global tensile_verbose 1 +%global tensile_library_format msgpack +%endif +%endif # suse_version +%if %{with msgpack} +%global tensile_library_format msgpack +%else +%global 
tensile_library_format yaml +%endif +%else +%global tensile_verbose %{nil} +%global tensile_library_format %{nil} +%endif # tensile + +%if %{with compress} +BuildRequires: pkgconfig(libzstd) +%endif + +%if %{with test} +%if 0%{?suse_version} +BuildRequires: %{python_module PyYAML} +%else +BuildRequires: python3dist(pyyaml) +%endif +BuildRequires: blas-devel +BuildRequires: libomp-devel +BuildRequires: rocminfo +BuildRequires: rocm-smi-devel +BuildRequires: roctracer-devel + +%if 0%{?suse_version} +BuildRequires: cblas-devel +BuildRequires: gcc-fortran +BuildRequires: gtest +%else +BuildRequires: gtest-devel +%endif + +%endif + +%if %{with ninja} +%if 0%{?fedora} || 0%{?rhel} +BuildRequires: ninja-build +%endif +%if 0%{?suse_version} +BuildRequires: ninja +%define __builder ninja +%endif +%endif + +Provides: rocblas = %{version}-%{release} + +# Only x86_64 works right now: +ExclusiveArch: x86_64 + +%description +rocBLAS is the AMD library for Basic Linear Algebra Subprograms +(BLAS) on the ROCm platform. It is implemented in the HIP +programming language and optimized for AMD GPUs. 
+ +%post -p /sbin/ldconfig +%postun -p /sbin/ldconfig + +%package devel +Summary: Libraries and headers for %{name} +Requires: %{name}%{?_isa} = %{version}-%{release} +Requires: cmake(hip) +Provides: rocblas-devel = %{version}-%{release} + +%description devel +%{summary} + +%package -n rocblas-tensile +Summary: ROCBlas Tensile Modules +Requires: %{name} = %version + +%description -n rocblas-tensile +BLAS architecture modules for all AMDGPU architectures + +%if %{with test} +%package test +Summary: Tests for %{name} +Requires: diffutils +Requires: %{name}%{?_isa} = %{version}-%{release} + +%description test +%{summary} +%endif + +%if %{with gfx950} + +%package gfx950 +Summary: The gfx950 rocBLAS package +Provides: rocblas-gfx950 = %{version}-%{release} +Conflicts: %{name} + +%description gfx950 +%{summary} + +%package gfx950-devel +Summary: The gfx950 rocBLAS development package +Requires: %{name}-gfx950%{?_isa} = %{version}-%{release} +Provides: rocblas-gfx950-devel = %{version}-%{release} +Conflicts: %{name}-devel + +%description gfx950-devel +%{summary} + +%if %{with test} && %{without tensile_package} +%package gfx950-test +Summary: The gfx950 rocBLAS test package +Requires: %{name}-gfx950%{?_isa} = %{version}-%{release} +Conflicts: %{name}-test + +%description gfx950-test +%{summary} + +%endif # gfx950-test +%endif # gfx950 + +%prep +%autosetup -p1 -n %{upstreamname}-rocm-%{version} +sed -i -e 's@set( BLAS_LIBRARY "blas" )@set( BLAS_LIBRARY "cblas" )@' clients/CMakeLists.txt +sed -i -e 's@target_link_libraries( rocblas-test PRIVATE ${BLAS_LIBRARY} ${GTEST_BOTH_LIBRARIES} roc::rocblas )@target_link_libraries( rocblas-test PRIVATE cblas ${GTEST_BOTH_LIBRARIES} roc::rocblas )@' clients/gtest/CMakeLists.txt + +# no git in this build +sed -i -e 's@find_package(Git REQUIRED)@find_package(Git)@' library/CMakeLists.txt + +# On Tumbleweed Q2,2025 +# /usr/include/gtest/internal/gtest-port.h:279:2: error: C++ versions less than C++14 are not supported. 
+# 279 | #error C++ versions less than C++14 are not supported. +# So raise CXX_STANDARD of the samples from 11 to 14. +sed -i -e 's@CXX_STANDARD 11@CXX_STANDARD 14@' clients/samples/CMakeLists.txt + +%if 0%{?suse_version} +# SUSE's libgfortran.so for gcc 14 lives here: +# /usr/lib64/gcc/x86_64-suse-linux/14/libgfortran.so +# It is not found unless that path is added with -L; it is not really needed, so comment out the -lgfortran entry. +sed -i -e 's@list( APPEND COMMON_LINK_LIBS "-lgfortran")@#list( APPEND COMMON_LINK_LIBS "-lgfortran")@' clients/{benchmarks,gtest}/CMakeLists.txt +%endif + +%build + +# With compat LLVM the system clang is the wrong one; use the path reported by hipconfig. +CLANG_PATH=`hipconfig --hipclangpath` +export TENSILE_ROCM_ASSEMBLER_PATH=${CLANG_PATH}/clang++ +export TENSILE_ROCM_OFFLOAD_BUNDLER_PATH=${CLANG_PATH}/clang-offload-bundler +# Work around problem with koji's ld +export HIPCC_LINK_FLAGS_APPEND=-fuse-ld=lld + +%if %{with tensile} +TP=`/usr/bin/TensileGetPath` +%endif + +CORES=`lscpu | grep 'Core(s)' | awk '{ print $4 }'` +if [ ${CORES}x = x ]; then + CORES=1 +fi +# If that left CORES at 1, try again with the lscpu CPU(s) line.
+if [ ${CORES} = 1 ]; then + CORES=`lscpu | grep '^CPU(s)' | awk '{ print $2 }'` + if [ ${CORES}x = x ]; then + CORES=4 + fi +fi + +%if %{with gfx950} + +module load rocm/gfx950 + +%cmake %{cmake_generator} %{cmake_config} \ + -DGPU_TARGETS=${ROCM_GPUS} \ + -DBUILD_WITH_TENSILE=OFF \ + -DCMAKE_INSTALL_BINDIR=${ROCM_BIN} \ + -DCMAKE_INSTALL_INCLUDEDIR=${ROCM_INCLUDE} \ + -DCMAKE_INSTALL_LIBDIR=${ROCM_LIB} + +%else + +%if %{build_tensile_separately} && %{without tensile_package} +export TENSILE_SKIP_LIBRARY=true +%endif + +%cmake %{cmake_generator} %{cmake_config} \ + -DGPU_TARGETS=%{gpu_list} \ + -DBUILD_WITH_TENSILE=%{build_tensile} \ + -DCMAKE_INSTALL_LIBDIR=%_libdir + +%endif + +%cmake_build %{?with_tensile_package:TENSILE_LIBRARY_TARGET} +%if %{with gfx950} +module purge +%endif + +%install +%if %{with tensile_package} +DESTDIR=%{buildroot} /usr/bin/cmake -P build/library/src/TensileInstall/cmake_install.cmake +%else +%cmake_install + +if [ -f %{buildroot}%{_prefix}/share/doc/rocblas/LICENSE.md ]; then + rm %{buildroot}%{_prefix}/share/doc/rocblas/LICENSE.md +fi +%endif + +%check +%if %{without tensile_package} +%if %{with test} +%if %{with check} +%if 0%{?suse_version} +export LD_LIBRARY_PATH=%{__builddir}/library/src:$LD_LIBRARY_PATH +%{__builddir}/clients/staging/rocblas-test --gtest_brief=1 +%else +export LD_LIBRARY_PATH=%{_vpath_builddir}/library/src:$LD_LIBRARY_PATH +%{_vpath_builddir}/clients/staging/rocblas-test --gtest_brief=1 +%endif +%endif +%endif +%endif + +%if %{with gfx950} +%files gfx950 +%license LICENSE.md +%{_libdir}/rocm/gfx950/lib/librocblas.so.4{,.*} + +%files gfx950-devel +%dir %{_libdir}/rocm/gfx950/include/rocblas +%dir %{_libdir}/rocm/gfx950/lib/cmake/rocblas +%{_libdir}/rocm/gfx950/include/rocblas/rocblas_module.f90 +%{_libdir}/rocm/gfx950/lib/librocblas.so +%{_libdir}/rocm/gfx950/lib/cmake/rocblas/*.cmake + +%if %{with test} +%files gfx950-test +%{_libdir}/rocm/gfx950/bin/rocblas* +%endif + +%else + +%if %{without tensile_package}
+%files +%license LICENSE.md +%{_libdir}/librocblas.so.4{,.*} +%if %{with tensile} +%if ! %{build_tensile_separately} +%dir %{_libdir}/rocblas +%dir %{_libdir}/rocblas/library +%{_libdir}/rocblas/library/Kernels* +%{_libdir}/rocblas/library/Tensile* +%endif +%endif # with tensile + +%files devel +%doc README.md +%dir %{_libdir}/cmake/rocblas +%dir %{_includedir}/rocblas +%{_includedir}/rocblas/* +%{_libdir}/cmake/rocblas/*.cmake +%{_libdir}/librocblas.so + +%if %{with test} +%files test +%{_bindir}/rocblas* +%endif + +%else # ?tensile_package + +%if %{with tensile} +%files -n rocblas-tensile +%dir %{_libdir}/rocblas +%dir %{_libdir}/rocblas/library +%{_libdir}/rocblas/library/Kernels* +%{_libdir}/rocblas/library/Tensile* +%endif + +%endif # ?tensile_package +%endif # gfx950 + +%changelog +* Thu Jun 12 2025 Egbert Eich - 6.4.0-9 +- Build and package core library and arch dependent + tensile modules separately to parallelize the build. +- Fix build and runtime dependencies of test package. +- Restructure spec file (move bcond_with* settings to + the top). +- Add rpmlintrc for SUSE. 
+ +* Wed Jun 11 2025 Tom Rix - 6.4.0-8 +- Remove suse check for using ldconfig + +* Sun May 11 2025 Tom Rix - 6.4.0-7 +- Add experimental gfx950 + +* Tue May 6 2025 Tom Rix - 6.4.0-6 +- disable roctracer for everyone + +* Tue Apr 29 2025 Tim Flink - 6.4.0-5 +- add patch for option to disable roctracer logging +- disable roctracer logging for rhel builds +- allow for builds on rhel with ninja + +* Tue Apr 29 2025 Tom Rix - 6.4.0-4 +- Improve testing for suse + +* Sat Apr 26 2025 Tom Rix - 6.4.0-3 +- Add generic gpus + +* Wed Apr 23 2025 Tom Rix - 6.4.0-2 +- Use joblib on sle 15.6 and 16.0 + +* Fri Apr 18 2025 Tom Rix - 6.4.0-1 +- Update to 6.4.0 + +* Thu Apr 10 2025 Tom Rix - 6.3.0-12 +- Reenable ninja + +* Fri Apr 4 2025 Tom Rix - 6.3.0-11 +- Use rocm-llvm strip + +* Thu Feb 27 2025 Cristian Le - 6.3.0-10 +- Add hip requirement to devel package + +* Thu Feb 27 2025 Tom Rix - 6.3.0-9 +- Enable tensile for RHEL + +* Wed Feb 26 2025 Tom Rix - 6.3.0-8 +- Enable tensile for SUSE + +* Sun Feb 23 2025 Tom Rix - 6.3.0-7 +- Use tensile verbosity to avoid OBS timeout + +* Wed Feb 19 2025 Tom Rix - 6.3.0-6 +- Use tensile cmake from the python location + +* Tue Feb 11 2025 Tom Rix - 6.3.0-5 +- Remove multibuild +- Fix SLE 15.6 + +* Sat Jan 18 2025 Tom Rix - 6.3.0-4 +- multithread rpm compress + +* Tue Jan 14 2025 Tom Rix - 6.3.0-3 +- build requires gcc-c++ + +* Fri Dec 20 2024 Tom Rix - 6.3.0-2 +- Build type should be release + +* Fri Dec 6 2024 Tom Rix - 6.3.0-1 +- Update to 6.3 + +* Sun Nov 10 2024 Tom Rix - 6.2.1-1 +- Stub for tumbleweed + + -- 2.51.1